Mercurial > libavcodec.hg
annotate i386/mpegvideo_mmx.c @ 1795:920e6381e1fe libavcodec
2 byte shorter userdata for mpeg4
in the past it was startcode,string,00,7F,startcode
now it is startcode,string,startcode
both are mpeg4 compliant, as according to the standard the userdata lasts until the next 00 00 01 (startcode prefix), but some very primitive decoders which simply skip until the first 00 byte and then expect the next valid startcode might fail with the old variant; just a theory though (didn't test if quicktime can decode it now)
author | michael |
---|---|
date | Sun, 08 Feb 2004 22:52:35 +0000 |
parents | e31754bc5b65 |
children | f65d87bfdd5a |
rev | line source |
---|---|
8 | 1 /* |
2 * The simplest mpeg encoder (well, it was the simplest!) | |
429 | 3 * Copyright (c) 2000,2001 Fabrice Bellard. |
8 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
8 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
8 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
8 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
8 | 18 * |
19 * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> | |
325 | 20 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> |
8 | 21 */ |
22 | |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
23 #include "../dsputil.h" |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
24 #include "../mpegvideo.h" |
220 | 25 #include "../avcodec.h" |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
26 |
1064 | 27 extern uint8_t zigzag_direct_noperm[64]; |
28 extern uint16_t inv_zigzag_direct16[64]; | |
200 | 29 |
8 | 30 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; |
31 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
32 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
33 |
1689 | 34 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
35 DCTELEM *block, int n, int qscale) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
36 { |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
37 int level, qmul, qadd, nCoeffs; |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
38 |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
39 qmul = qscale << 1; |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
40 |
1661 | 41 assert(s->block_last_index[n]>=0 || s->h263_aic); |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
42 |
1689 | 43 if (!s->h263_aic) { |
44 if (n < 4) | |
45 level = block[0] * s->y_dc_scale; | |
46 else | |
47 level = block[0] * s->c_dc_scale; | |
48 qadd = (qscale - 1) | 1; | |
49 }else{ | |
50 qadd = 0; | |
51 level= block[0]; | |
52 } | |
53 if(s->ac_pred) | |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
54 nCoeffs=63; |
1689 | 55 else |
56 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
200 | 57 //printf("%d %d ", qmul, qadd); |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
58 asm volatile( |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
59 "movd %1, %%mm6 \n\t" //qmul |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
60 "packssdw %%mm6, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
61 "packssdw %%mm6, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
62 "movd %2, %%mm5 \n\t" //qadd |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
63 "pxor %%mm7, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
64 "packssdw %%mm5, %%mm5 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
65 "packssdw %%mm5, %%mm5 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
66 "psubw %%mm5, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
67 "pxor %%mm4, %%mm4 \n\t" |
153 | 68 ".balign 16\n\t" |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
69 "1: \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
70 "movq (%0, %3), %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
71 "movq 8(%0, %3), %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
72 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
73 "pmullw %%mm6, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
74 "pmullw %%mm6, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
75 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
76 "movq (%0, %3), %%mm2 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
77 "movq 8(%0, %3), %%mm3 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
78 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
79 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
80 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
81 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
82 "pxor %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
83 "pxor %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
84 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
85 "paddw %%mm7, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
86 "paddw %%mm7, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
87 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
88 "pxor %%mm0, %%mm2 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
89 "pxor %%mm1, %%mm3 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
90 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
91 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
92 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
93 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
94 "pandn %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
95 "pandn %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
96 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
97 "movq %%mm0, (%0, %3) \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
98 "movq %%mm1, 8(%0, %3) \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
99 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
100 "addl $16, %3 \n\t" |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
101 "jng 1b \n\t" |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
102 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
103 : "memory" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
104 ); |
1689 | 105 block[0]= level; |
106 } | |
107 | |
108 | |
109 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
110 DCTELEM *block, int n, int qscale) | |
111 { | |
112 int level, qmul, qadd, nCoeffs; | |
113 | |
114 qmul = qscale << 1; | |
115 qadd = (qscale - 1) | 1; | |
116 | |
117 assert(s->block_last_index[n]>=0 || s->h263_aic); | |
118 | |
119 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
120 //printf("%d %d ", qmul, qadd); | |
121 asm volatile( | |
122 "movd %1, %%mm6 \n\t" //qmul | |
123 "packssdw %%mm6, %%mm6 \n\t" | |
124 "packssdw %%mm6, %%mm6 \n\t" | |
125 "movd %2, %%mm5 \n\t" //qadd | |
126 "pxor %%mm7, %%mm7 \n\t" | |
127 "packssdw %%mm5, %%mm5 \n\t" | |
128 "packssdw %%mm5, %%mm5 \n\t" | |
129 "psubw %%mm5, %%mm7 \n\t" | |
130 "pxor %%mm4, %%mm4 \n\t" | |
131 ".balign 16\n\t" | |
132 "1: \n\t" | |
133 "movq (%0, %3), %%mm0 \n\t" | |
134 "movq 8(%0, %3), %%mm1 \n\t" | |
135 | |
136 "pmullw %%mm6, %%mm0 \n\t" | |
137 "pmullw %%mm6, %%mm1 \n\t" | |
138 | |
139 "movq (%0, %3), %%mm2 \n\t" | |
140 "movq 8(%0, %3), %%mm3 \n\t" | |
141 | |
142 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
143 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
144 | |
145 "pxor %%mm2, %%mm0 \n\t" | |
146 "pxor %%mm3, %%mm1 \n\t" | |
147 | |
148 "paddw %%mm7, %%mm0 \n\t" | |
149 "paddw %%mm7, %%mm1 \n\t" | |
150 | |
151 "pxor %%mm0, %%mm2 \n\t" | |
152 "pxor %%mm1, %%mm3 \n\t" | |
153 | |
154 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
155 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
156 | |
157 "pandn %%mm2, %%mm0 \n\t" | |
158 "pandn %%mm3, %%mm1 \n\t" | |
159 | |
160 "movq %%mm0, (%0, %3) \n\t" | |
161 "movq %%mm1, 8(%0, %3) \n\t" | |
162 | |
163 "addl $16, %3 \n\t" | |
164 "jng 1b \n\t" | |
165 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) | |
166 : "memory" | |
167 ); | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
168 } |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
169 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
170 |
8 | 171 /* |
172 NK: | |
173 Note: looking at PARANOID: | |
174 "enable all paranoid tests for rounding, overflows, etc..." | |
175 | |
176 #ifdef PARANOID | |
177 if (level < -2048 || level > 2047) | |
178 fprintf(stderr, "unquant error %d %d\n", i, level); | |
179 #endif | |
180 We can suppose that the result of the two multiplications can't be greater than 0xFFFF, | |
181 i.e. it fits in 16 bits, so we use only the PMULLW instruction here and can avoid | |
182 a complex multiplication. | |
183 ===================================================== | |
184 Full formula for multiplication of 2 integer numbers | |
185 which are represented as high:low words: | |
186 input: value1 = high1:low1 | |
187 value2 = high2:low2 | |
188 output: value3 = value1*value2 | |
189 value3=high3:low3 (on overflow: modulus 2^32 wrap-around) | |
190 this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4 | |
191 but this algorithm will compute only 0x66cb0ce4 | |
192 this is limited by the 16-bit size of the operands | |
193 --------------------------------- | |
194 tlow1 = high1*low2 | |
195 tlow2 = high2*low1 | |
196 tlow1 = tlow1 + tlow2 | |
197 high3:low3 = low1*low2 | |
198 high3 += tlow1 | |
199 */ | |
1689 | 200 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
201 DCTELEM *block, int n, int qscale) |
8 | 202 { |
325 | 203 int nCoeffs; |
1064 | 204 const uint16_t *quant_matrix; |
1689 | 205 int block0; |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
206 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
207 assert(s->block_last_index[n]>=0); |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
208 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
209 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; |
200 | 210 |
1689 | 211 if (n < 4) |
212 block0 = block[0] * s->y_dc_scale; | |
213 else | |
214 block0 = block[0] * s->c_dc_scale; | |
215 /* XXX: only mpeg1 */ | |
216 quant_matrix = s->intra_matrix; | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
217 asm volatile( |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
218 "pcmpeqw %%mm7, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
219 "psrlw $15, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
220 "movd %2, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
221 "packssdw %%mm6, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
222 "packssdw %%mm6, %%mm6 \n\t" |
325 | 223 "movl %3, %%eax \n\t" |
153 | 224 ".balign 16\n\t" |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
225 "1: \n\t" |
325 | 226 "movq (%0, %%eax), %%mm0 \n\t" |
227 "movq 8(%0, %%eax), %%mm1 \n\t" | |
228 "movq (%1, %%eax), %%mm4 \n\t" | |
229 "movq 8(%1, %%eax), %%mm5 \n\t" | |
230 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
231 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
232 "pxor %%mm2, %%mm2 \n\t" | |
233 "pxor %%mm3, %%mm3 \n\t" | |
234 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
235 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
236 "pxor %%mm2, %%mm0 \n\t" | |
237 "pxor %%mm3, %%mm1 \n\t" | |
238 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
239 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
240 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
241 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
242 "pxor %%mm4, %%mm4 \n\t" | |
243 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
244 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
245 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
246 "psraw $3, %%mm0 \n\t" | |
247 "psraw $3, %%mm1 \n\t" | |
248 "psubw %%mm7, %%mm0 \n\t" | |
249 "psubw %%mm7, %%mm1 \n\t" | |
250 "por %%mm7, %%mm0 \n\t" | |
251 "por %%mm7, %%mm1 \n\t" | |
252 "pxor %%mm2, %%mm0 \n\t" | |
253 "pxor %%mm3, %%mm1 \n\t" | |
254 "psubw %%mm2, %%mm0 \n\t" | |
255 "psubw %%mm3, %%mm1 \n\t" | |
256 "pandn %%mm0, %%mm4 \n\t" | |
257 "pandn %%mm1, %%mm5 \n\t" | |
258 "movq %%mm4, (%0, %%eax) \n\t" | |
259 "movq %%mm5, 8(%0, %%eax) \n\t" | |
260 | |
261 "addl $16, %%eax \n\t" | |
262 "js 1b \n\t" | |
263 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
264 : "%eax", "memory" | |
265 ); | |
1689 | 266 block[0]= block0; |
267 } | |
325 | 268 |
1689 | 269 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, |
270 DCTELEM *block, int n, int qscale) | |
271 { | |
272 int nCoeffs; | |
273 const uint16_t *quant_matrix; | |
274 | |
275 assert(s->block_last_index[n]>=0); | |
276 | |
277 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
278 | |
344 | 279 quant_matrix = s->inter_matrix; |
325 | 280 asm volatile( |
281 "pcmpeqw %%mm7, %%mm7 \n\t" | |
282 "psrlw $15, %%mm7 \n\t" | |
283 "movd %2, %%mm6 \n\t" | |
284 "packssdw %%mm6, %%mm6 \n\t" | |
285 "packssdw %%mm6, %%mm6 \n\t" | |
286 "movl %3, %%eax \n\t" | |
287 ".balign 16\n\t" | |
288 "1: \n\t" | |
289 "movq (%0, %%eax), %%mm0 \n\t" | |
290 "movq 8(%0, %%eax), %%mm1 \n\t" | |
291 "movq (%1, %%eax), %%mm4 \n\t" | |
292 "movq 8(%1, %%eax), %%mm5 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
293 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
294 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
295 "pxor %%mm2, %%mm2 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
296 "pxor %%mm3, %%mm3 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
297 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
298 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
299 "pxor %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
300 "pxor %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
301 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
302 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
303 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
304 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
305 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
306 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
307 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
308 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
309 "pxor %%mm4, %%mm4 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
310 "pxor %%mm5, %%mm5 \n\t" // FIXME slow |
325 | 311 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 |
312 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
313 "psraw $4, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
314 "psraw $4, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
315 "psubw %%mm7, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
316 "psubw %%mm7, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
317 "por %%mm7, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
318 "por %%mm7, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
319 "pxor %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
320 "pxor %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
321 "psubw %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
322 "psubw %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
323 "pandn %%mm0, %%mm4 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
324 "pandn %%mm1, %%mm5 \n\t" |
325 | 325 "movq %%mm4, (%0, %%eax) \n\t" |
326 "movq %%mm5, 8(%0, %%eax) \n\t" | |
327 | |
328 "addl $16, %%eax \n\t" | |
329 "js 1b \n\t" | |
330 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
331 : "%eax", "memory" | |
332 ); | |
333 } | |
334 | |
1689 | 335 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, |
325 | 336 DCTELEM *block, int n, int qscale) |
337 { | |
338 int nCoeffs; | |
1064 | 339 const uint16_t *quant_matrix; |
1689 | 340 int block0; |
325 | 341 |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
342 assert(s->block_last_index[n]>=0); |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
343 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
344 if(s->alternate_scan) nCoeffs= 63; //FIXME |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
345 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; |
325 | 346 |
1689 | 347 if (n < 4) |
348 block0 = block[0] * s->y_dc_scale; | |
349 else | |
350 block0 = block[0] * s->c_dc_scale; | |
351 quant_matrix = s->intra_matrix; | |
325 | 352 asm volatile( |
353 "pcmpeqw %%mm7, %%mm7 \n\t" | |
354 "psrlw $15, %%mm7 \n\t" | |
355 "movd %2, %%mm6 \n\t" | |
356 "packssdw %%mm6, %%mm6 \n\t" | |
357 "packssdw %%mm6, %%mm6 \n\t" | |
358 "movl %3, %%eax \n\t" | |
359 ".balign 16\n\t" | |
360 "1: \n\t" | |
361 "movq (%0, %%eax), %%mm0 \n\t" | |
362 "movq 8(%0, %%eax), %%mm1 \n\t" | |
363 "movq (%1, %%eax), %%mm4 \n\t" | |
364 "movq 8(%1, %%eax), %%mm5 \n\t" | |
365 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
366 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
367 "pxor %%mm2, %%mm2 \n\t" | |
368 "pxor %%mm3, %%mm3 \n\t" | |
369 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
370 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
371 "pxor %%mm2, %%mm0 \n\t" | |
372 "pxor %%mm3, %%mm1 \n\t" | |
373 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
374 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
375 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
376 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
377 "pxor %%mm4, %%mm4 \n\t" | |
378 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
379 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
380 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
381 "psraw $3, %%mm0 \n\t" | |
382 "psraw $3, %%mm1 \n\t" | |
383 "pxor %%mm2, %%mm0 \n\t" | |
384 "pxor %%mm3, %%mm1 \n\t" | |
385 "psubw %%mm2, %%mm0 \n\t" | |
386 "psubw %%mm3, %%mm1 \n\t" | |
387 "pandn %%mm0, %%mm4 \n\t" | |
388 "pandn %%mm1, %%mm5 \n\t" | |
389 "movq %%mm4, (%0, %%eax) \n\t" | |
390 "movq %%mm5, 8(%0, %%eax) \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
391 |
325 | 392 "addl $16, %%eax \n\t" |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
393 "jng 1b \n\t" |
325 | 394 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) |
395 : "%eax", "memory" | |
396 ); | |
1689 | 397 block[0]= block0; |
325 | 398 //Note, we don't do mismatch control for intra as errors cannot accumulate |
1689 | 399 } |
325 | 400 |
1689 | 401 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, |
402 DCTELEM *block, int n, int qscale) | |
403 { | |
404 int nCoeffs; | |
405 const uint16_t *quant_matrix; | |
406 | |
407 assert(s->block_last_index[n]>=0); | |
408 | |
409 if(s->alternate_scan) nCoeffs= 63; //FIXME | |
410 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
411 | |
344 | 412 quant_matrix = s->inter_matrix; |
325 | 413 asm volatile( |
414 "pcmpeqw %%mm7, %%mm7 \n\t" | |
415 "psrlq $48, %%mm7 \n\t" | |
416 "movd %2, %%mm6 \n\t" | |
417 "packssdw %%mm6, %%mm6 \n\t" | |
418 "packssdw %%mm6, %%mm6 \n\t" | |
419 "movl %3, %%eax \n\t" | |
420 ".balign 16\n\t" | |
421 "1: \n\t" | |
422 "movq (%0, %%eax), %%mm0 \n\t" | |
423 "movq 8(%0, %%eax), %%mm1 \n\t" | |
424 "movq (%1, %%eax), %%mm4 \n\t" | |
425 "movq 8(%1, %%eax), %%mm5 \n\t" | |
426 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
427 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
428 "pxor %%mm2, %%mm2 \n\t" | |
429 "pxor %%mm3, %%mm3 \n\t" | |
430 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
431 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
432 "pxor %%mm2, %%mm0 \n\t" | |
433 "pxor %%mm3, %%mm1 \n\t" | |
434 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
435 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
436 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
437 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
438 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | |
439 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | |
440 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
441 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
442 "pxor %%mm4, %%mm4 \n\t" | |
443 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
444 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
445 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
446 "psrlw $4, %%mm0 \n\t" | |
447 "psrlw $4, %%mm1 \n\t" | |
448 "pxor %%mm2, %%mm0 \n\t" | |
449 "pxor %%mm3, %%mm1 \n\t" | |
450 "psubw %%mm2, %%mm0 \n\t" | |
451 "psubw %%mm3, %%mm1 \n\t" | |
452 "pandn %%mm0, %%mm4 \n\t" | |
453 "pandn %%mm1, %%mm5 \n\t" | |
454 "pxor %%mm4, %%mm7 \n\t" | |
455 "pxor %%mm5, %%mm7 \n\t" | |
456 "movq %%mm4, (%0, %%eax) \n\t" | |
457 "movq %%mm5, 8(%0, %%eax) \n\t" | |
458 | |
459 "addl $16, %%eax \n\t" | |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
460 "jng 1b \n\t" |
325 | 461 "movd 124(%0, %3), %%mm0 \n\t" |
462 "movq %%mm7, %%mm6 \n\t" | |
463 "psrlq $32, %%mm7 \n\t" | |
464 "pxor %%mm6, %%mm7 \n\t" | |
465 "movq %%mm7, %%mm6 \n\t" | |
466 "psrlq $16, %%mm7 \n\t" | |
467 "pxor %%mm6, %%mm7 \n\t" | |
468 "pslld $31, %%mm7 \n\t" | |
469 "psrlq $15, %%mm7 \n\t" | |
470 "pxor %%mm7, %%mm0 \n\t" | |
471 "movd %%mm0, 124(%0, %3) \n\t" | |
472 | |
473 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) | |
474 : "%eax", "memory" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
475 ); |
8 | 476 } |
477 | |
206 | 478 /* draw the edges of width 'w' of an image of size width, height |
479 this mmx version can only handle w==8 || w==16 */ | |
1064 | 480 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) |
206 | 481 { |
1064 | 482 uint8_t *ptr, *last_line; |
206 | 483 int i; |
484 | |
485 last_line = buf + (height - 1) * wrap; | |
486 /* left and right */ | |
487 ptr = buf; | |
488 if(w==8) | |
489 { | |
490 asm volatile( | |
491 "1: \n\t" | |
492 "movd (%0), %%mm0 \n\t" | |
493 "punpcklbw %%mm0, %%mm0 \n\t" | |
494 "punpcklwd %%mm0, %%mm0 \n\t" | |
495 "punpckldq %%mm0, %%mm0 \n\t" | |
496 "movq %%mm0, -8(%0) \n\t" | |
497 "movq -8(%0, %2), %%mm1 \n\t" | |
498 "punpckhbw %%mm1, %%mm1 \n\t" | |
499 "punpckhwd %%mm1, %%mm1 \n\t" | |
500 "punpckhdq %%mm1, %%mm1 \n\t" | |
501 "movq %%mm1, (%0, %2) \n\t" | |
502 "addl %1, %0 \n\t" | |
503 "cmpl %3, %0 \n\t" | |
504 " jb 1b \n\t" | |
505 : "+r" (ptr) | |
506 : "r" (wrap), "r" (width), "r" (ptr + wrap*height) | |
507 ); | |
508 } | |
509 else | |
510 { | |
511 asm volatile( | |
512 "1: \n\t" | |
513 "movd (%0), %%mm0 \n\t" | |
514 "punpcklbw %%mm0, %%mm0 \n\t" | |
515 "punpcklwd %%mm0, %%mm0 \n\t" | |
516 "punpckldq %%mm0, %%mm0 \n\t" | |
517 "movq %%mm0, -8(%0) \n\t" | |
518 "movq %%mm0, -16(%0) \n\t" | |
519 "movq -8(%0, %2), %%mm1 \n\t" | |
520 "punpckhbw %%mm1, %%mm1 \n\t" | |
521 "punpckhwd %%mm1, %%mm1 \n\t" | |
522 "punpckhdq %%mm1, %%mm1 \n\t" | |
523 "movq %%mm1, (%0, %2) \n\t" | |
524 "movq %%mm1, 8(%0, %2) \n\t" | |
525 "addl %1, %0 \n\t" | |
526 "cmpl %3, %0 \n\t" | |
527 " jb 1b \n\t" | |
528 : "+r" (ptr) | |
529 : "r" (wrap), "r" (width), "r" (ptr + wrap*height) | |
530 ); | |
531 } | |
532 | |
533 for(i=0;i<w;i+=4) { | |
534 /* top and bottom (and hopefully also the corners) */ | |
535 ptr= buf - (i + 1) * wrap - w; | |
536 asm volatile( | |
537 "1: \n\t" | |
538 "movq (%1, %0), %%mm0 \n\t" | |
539 "movq %%mm0, (%0) \n\t" | |
540 "movq %%mm0, (%0, %2) \n\t" | |
541 "movq %%mm0, (%0, %2, 2) \n\t" | |
542 "movq %%mm0, (%0, %3) \n\t" | |
543 "addl $8, %0 \n\t" | |
544 "cmpl %4, %0 \n\t" | |
545 " jb 1b \n\t" | |
546 : "+r" (ptr) | |
547 : "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w) | |
548 ); | |
549 ptr= last_line + (i + 1) * wrap - w; | |
550 asm volatile( | |
551 "1: \n\t" | |
552 "movq (%1, %0), %%mm0 \n\t" | |
553 "movq %%mm0, (%0) \n\t" | |
554 "movq %%mm0, (%0, %2) \n\t" | |
555 "movq %%mm0, (%0, %2, 2) \n\t" | |
556 "movq %%mm0, (%0, %3) \n\t" | |
557 "addl $8, %0 \n\t" | |
558 "cmpl %4, %0 \n\t" | |
559 " jb 1b \n\t" | |
560 : "+r" (ptr) | |
561 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) | |
562 ); | |
563 } | |
564 } | |
565 | |
1719 | 566 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ |
567 const int intra= s->mb_intra; | |
568 int *sum= s->dct_error_sum[intra]; | |
569 uint16_t *offset= s->dct_offset[intra]; | |
570 | |
571 s->dct_count[intra]++; | |
572 | |
573 asm volatile( | |
574 "pxor %%mm7, %%mm7 \n\t" | |
575 "1: \n\t" | |
576 "pxor %%mm0, %%mm0 \n\t" | |
577 "pxor %%mm1, %%mm1 \n\t" | |
578 "movq (%0), %%mm2 \n\t" | |
579 "movq 8(%0), %%mm3 \n\t" | |
580 "pcmpgtw %%mm2, %%mm0 \n\t" | |
581 "pcmpgtw %%mm3, %%mm1 \n\t" | |
582 "pxor %%mm0, %%mm2 \n\t" | |
583 "pxor %%mm1, %%mm3 \n\t" | |
584 "psubw %%mm0, %%mm2 \n\t" | |
585 "psubw %%mm1, %%mm3 \n\t" | |
586 "movq %%mm2, %%mm4 \n\t" | |
587 "movq %%mm3, %%mm5 \n\t" | |
588 "psubusw (%2), %%mm2 \n\t" | |
589 "psubusw 8(%2), %%mm3 \n\t" | |
590 "pxor %%mm0, %%mm2 \n\t" | |
591 "pxor %%mm1, %%mm3 \n\t" | |
592 "psubw %%mm0, %%mm2 \n\t" | |
593 "psubw %%mm1, %%mm3 \n\t" | |
594 "movq %%mm2, (%0) \n\t" | |
595 "movq %%mm3, 8(%0) \n\t" | |
596 "movq %%mm4, %%mm2 \n\t" | |
597 "movq %%mm5, %%mm3 \n\t" | |
598 "punpcklwd %%mm7, %%mm4 \n\t" | |
599 "punpckhwd %%mm7, %%mm2 \n\t" | |
600 "punpcklwd %%mm7, %%mm5 \n\t" | |
601 "punpckhwd %%mm7, %%mm3 \n\t" | |
602 "paddd (%1), %%mm4 \n\t" | |
603 "paddd 8(%1), %%mm2 \n\t" | |
604 "paddd 16(%1), %%mm5 \n\t" | |
605 "paddd 24(%1), %%mm3 \n\t" | |
606 "movq %%mm4, (%1) \n\t" | |
607 "movq %%mm2, 8(%1) \n\t" | |
608 "movq %%mm5, 16(%1) \n\t" | |
609 "movq %%mm3, 24(%1) \n\t" | |
610 "addl $16, %0 \n\t" | |
611 "addl $32, %1 \n\t" | |
612 "addl $16, %2 \n\t" | |
613 "cmpl %3, %0 \n\t" | |
614 " jb 1b \n\t" | |
615 : "+r" (block), "+r" (sum), "+r" (offset) | |
616 : "r"(block+64) | |
617 ); | |
618 } | |
619 | |
1720
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
620 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
621 const int intra= s->mb_intra; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
622 int *sum= s->dct_error_sum[intra]; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
623 uint16_t *offset= s->dct_offset[intra]; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
624 |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
625 s->dct_count[intra]++; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
626 |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
627 asm volatile( |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
628 "pxor %%xmm7, %%xmm7 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
629 "1: \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
630 "pxor %%xmm0, %%xmm0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
631 "pxor %%xmm1, %%xmm1 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
632 "movdqa (%0), %%xmm2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
633 "movdqa 16(%0), %%xmm3 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
634 "pcmpgtw %%xmm2, %%xmm0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
635 "pcmpgtw %%xmm3, %%xmm1 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
636 "pxor %%xmm0, %%xmm2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
637 "pxor %%xmm1, %%xmm3 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
638 "psubw %%xmm0, %%xmm2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
639 "psubw %%xmm1, %%xmm3 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
640 "movdqa %%xmm2, %%xmm4 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
641 "movdqa %%xmm3, %%xmm5 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
642 "psubusw (%2), %%xmm2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
643 "psubusw 16(%2), %%xmm3 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
644 "pxor %%xmm0, %%xmm2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
645 "pxor %%xmm1, %%xmm3 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
646 "psubw %%xmm0, %%xmm2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
647 "psubw %%xmm1, %%xmm3 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
648 "movdqa %%xmm2, (%0) \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
649 "movdqa %%xmm3, 16(%0) \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
650 "movdqa %%xmm4, %%xmm6 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
651 "movdqa %%xmm5, %%xmm0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
652 "punpcklwd %%xmm7, %%xmm4 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
653 "punpckhwd %%xmm7, %%xmm6 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
654 "punpcklwd %%xmm7, %%xmm5 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
655 "punpckhwd %%xmm7, %%xmm0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
656 "paddd (%1), %%xmm4 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
657 "paddd 16(%1), %%xmm6 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
658 "paddd 32(%1), %%xmm5 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
659 "paddd 48(%1), %%xmm0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
660 "movdqa %%xmm4, (%1) \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
661 "movdqa %%xmm6, 16(%1) \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
662 "movdqa %%xmm5, 32(%1) \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
663 "movdqa %%xmm0, 48(%1) \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
664 "addl $32, %0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
665 "addl $64, %1 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
666 "addl $32, %2 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
667 "cmpl %3, %0 \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
668 " jb 1b \n\t" |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
669 : "+r" (block), "+r" (sum), "+r" (offset) |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
670 : "r"(block+64) |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
671 ); |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
672 } |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
673 |
220 | 674 #undef HAVE_MMX2 |
675 #define RENAME(a) a ## _MMX | |
1565 | 676 #define RENAMEl(a) a ## _mmx |
220 | 677 #include "mpegvideo_mmx_template.c" |
678 | |
679 #define HAVE_MMX2 | |
680 #undef RENAME | |
1597 | 681 #undef RENAMEl |
220 | 682 #define RENAME(a) a ## _MMX2 |
1565 | 683 #define RENAMEl(a) a ## _mmx2 |
220 | 684 #include "mpegvideo_mmx_template.c" |
206 | 685 |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
686 #undef RENAME |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
687 #undef RENAMEl |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
688 #define RENAME(a) a ## _SSE2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
689 #define RENAMEl(a) a ## _sse2 |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
690 #include "mpegvideo_mmx_template.c" |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
691 |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
692 void MPV_common_init_mmx(MpegEncContext *s) |
8 | 693 { |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
694 if (mm_flags & MM_MMX) { |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
695 const int dct_algo = s->avctx->dct_algo; |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
687
diff
changeset
|
696 |
1689 | 697 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; |
698 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
699 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
700 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
701 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; | |
702 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; | |
312 | 703 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
704 draw_edges = draw_edges_mmx; |
1719 | 705 |
1720
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
706 if (mm_flags & MM_SSE2) { |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
707 s->denoise_dct= denoise_dct_sse2; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
708 } else { |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
709 s->denoise_dct= denoise_dct_mmx; |
96a86bd1e0d5
denoise_dct_sse2() patch by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1719
diff
changeset
|
710 } |
220 | 711 |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
712 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
713 if(mm_flags & MM_SSE2){ |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
714 s->dct_quantize= dct_quantize_SSE2; |
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1720
diff
changeset
|
715 } else if(mm_flags & MM_MMXEXT){ |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
716 s->dct_quantize= dct_quantize_MMX2; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
717 } else { |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
718 s->dct_quantize= dct_quantize_MMX; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
719 } |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
720 } |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
721 } |
8 | 722 } |