Mercurial > libavcodec.hg
annotate i386/mpegvideo_mmx_template.c @ 625:bb6a69f9d409 libavcodec
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
per context DCT selection
author | michaelni |
---|---|
date | Thu, 29 Aug 2002 23:55:32 +0000 |
parents | b1a191202f96 |
children | 3e0f62e5eed6 |
rev | line source |
---|---|
220 | 1 /* |
429 | 2 * MPEG video MMX templates |
3 * | |
4 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> | |
5 * | |
6 * This library is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
10 * | |
11 * This library is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
19 */ | |
/*
 * Assembler-string helpers used inside the asm() statements of this template.
 * They are #undef'd first, presumably because the template is included more
 * than once with different HAVE_MMX2 settings -- TODO confirm against includer.
 *
 * SPREADW(a)  : copy the low 16-bit word of MMX register a into all four
 *               word lanes of a.
 *               MMX2: a single pshufw with selector 0.
 *               MMX : two punpcklwd a,a (w0 w1 . . -> w0 w0 w1 w1 -> w0 w0 w0 w0).
 * PMAXW(a, b) : per-word maximum of a and b, result left in b.
 *               MMX2: pmaxsw, a signed maximum.
 *               MMX : psubusw + paddw, i.e. sat_unsigned(b - a) + a, which is
 *               an unsigned maximum.
 *               NOTE(review): the two variants only agree when both operands
 *               are non-negative as signed words; that looks true for the
 *               index values they are applied to in this file -- confirm.
 */
220 | 20 #undef SPREADW |
21 #undef PMAXW | |
22 #ifdef HAVE_MMX2 | |
23 #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t" | |
24 #define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t" | |
25 | |
26 #else | |
/* plain MMX fallbacks for the two MMX2 instructions above */
27 #define SPREADW(a) \ | |
28 "punpcklwd " #a ", " #a " \n\t"\ | |
29 "punpcklwd " #a ", " #a " \n\t" | |
30 #define PMAXW(a,b) \ | |
31 "psubusw " #a ", " #b " \n\t"\ | |
32 "paddw " #a ", " #b " \n\t" | |
33 #endif | |
34 | |
/*
 * Forward-DCT, quantize and permute one block of 64 coefficients using MMX.
 *
 * Steps, in the order they appear below:
 *  1. fdct_mmx() transforms block in place.
 *  2. For intra blocks (s->mb_intra) the DC term is handled in scalar code:
 *     q is s->y_dc_scale when n < 4 and s->c_dc_scale otherwise. Unless
 *     s->h263_aic is set, level becomes the high 32 bits of
 *     (block[0] + (q >> 1)) * inverse[q]; with h263_aic set, level is
 *     block[0] unchanged. block[0] is then cleared and last_non_zero_p1
 *     starts at 1. For inter blocks last_non_zero_p1 starts at 0.
 *     (inverse[] is defined elsewhere; presumably it holds about 2^32 / q so
 *     that the multiply is a rounded division by q -- TODO confirm.)
 *  3. One of two MMX loops walks the 64 entries four words at a time
 *     (16 iterations, byte offset -128 up to 0 relative to block+64). Each
 *     iteration takes ABS(block[i]), adds the bias with unsigned saturation,
 *     multiplies by qmat and keeps the high word (pmulhw), restores the sign,
 *     stores the result in temp_block[i] and writes 0 to block[i].
 *       - out_format == FMT_H263 && mpeg_quant == 0: the first four qmat and
 *         bias words are loaded once and reused for every group of four.
 *       - otherwise: qmat[i] and bias[i] are loaded per coefficient.
 *     While looping, mm4 collects the OR of all quantized magnitudes and mm3
 *     the per-lane maximum of inv_zigzag_direct16[i] over coefficients whose
 *     quantized value is non-zero; the lanes are then reduced to one value
 *     and its low byte becomes last_non_zero_p1.
 *  4. *overflow is produced from mm4: max_qcoeff is subtracted with unsigned
 *     saturation and the words are packed to bytes. Because mm4 is an OR and
 *     not a maximum, a non-zero result means a coefficient MAY exceed
 *     max_qcoeff; zero means none does.
 *  5. For intra blocks temp_block[0] is set to level. The final asm copies
 *     temp_block[j] to block[permutation[j]] for j taken from
 *     zigzag_direct_noperm[], covering the first last_non_zero_p1 entries.
 *
 * s        : encoder context; the fields read here are mb_intra, y_dc_scale,
 *            c_dc_scale, h263_aic, q_intra_matrix16(_bias),
 *            q_inter_matrix16(_bias), out_format, mpeg_quant and max_qcoeff.
 * block    : 64 coefficients, transformed and overwritten in place. The asm
 *            treats them as 16-bit words (assumes DCTELEM is 16 bits wide --
 *            TODO confirm).
 * n        : block index; only used to choose between y_dc_scale (n < 4) and
 *            c_dc_scale.
 * qscale   : index into the q_*_matrix16 and q_*_matrix16_bias tables.
 * overflow : output, see step 4.
 * returns  : last_non_zero_p1 - 1.
 *
 * NOTE(review): temp_block is static and the permute asm parks %esp in the
 * global esp_temp, so this function uses shared state across calls; it does
 * not look safe to run concurrently -- confirm with callers.
 * NOTE(review): the MMX asm statements store to block and temp_block and
 * modify mm0-mm7 without declaring any clobbers.
 */
35 static int RENAME(dct_quantize)(MpegEncContext *s, | |
36 DCTELEM *block, int n, | |
344 | 37 int qscale, int *overflow) |
220 | 38 { |
344 | 39 int level=0, last_non_zero_p1, q; //=0 only because gcc warns that level may be used uninitialized ... |
40 const UINT16 *qmat, *bias; | |
// quantized coefficients in natural order, filled by the MMX loops below
220 | 41 static __align8 INT16 temp_block[64]; |
42 | |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
599
diff
changeset
|
43 //s->fdct (block); |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
599
diff
changeset
|
44 fdct_mmx (block); //cannot be anything else ... |
344 | 45 |
220 | 46 if (s->mb_intra) { |
47 int dummy; | |
48 if (n < 4) | |
49 q = s->y_dc_scale; | |
50 else | |
51 q = s->c_dc_scale; | |
52 /* note: block[0] is assumed to be positive */ | |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
53 if (!s->h263_aic) { |
// Active variant: level = high 32 bits of (block[0] + (q >> 1)) * inverse[q].
// The #else variant divides by q with divw instead.
220 | 54 #if 1 |
344 | 55 asm volatile ( |
56 "xorl %%edx, %%edx \n\t" | |
57 "mul %%ecx \n\t" | |
58 : "=d" (level), "=a"(dummy) | |
59 : "a" (block[0] + (q >> 1)), "c" (inverse[q]) | |
60 ); | |
220 | 61 #else |
344 | 62 asm volatile ( |
63 "xorl %%edx, %%edx \n\t" | |
64 "divw %%cx \n\t" | |
65 "movzwl %%ax, %%eax \n\t" | |
66 : "=a" (level) | |
67 : "a" (block[0] + (q >> 1)), "c" (q) | |
68 : "%edx" | |
69 ); | |
220 | 70 #endif |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
71 } else |
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
72 /* For AIC we skip quant/dequant of INTRADC */ |
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
73 level = block[0]; |
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
74 |
344 | 75 block[0]=0; //avoid fake overflow: the DC term is kept in level, so hide it from the vector loop |
220 | 76 // temp_block[0] = (block[0] + (q >> 1)) / q; |
77 last_non_zero_p1 = 1; | |
344 | 78 bias = s->q_intra_matrix16_bias[qscale]; |
79 qmat = s->q_intra_matrix16[qscale]; | |
220 | 80 } else { |
81 last_non_zero_p1 = 0; | |
344 | 82 bias = s->q_inter_matrix16_bias[qscale]; |
83 qmat = s->q_inter_matrix16[qscale]; | |
220 | 84 } |
85 | |
// First variant: qmat and bias are loaded once before the loop and the same
// four words are applied to every group of four coefficients.
599 | 86 if(s->out_format == FMT_H263 && s->mpeg_quant==0){ |
344 | 87 |
88 asm volatile( | |
89 "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 | |
90 SPREADW(%%mm3) | |
91 "pxor %%mm7, %%mm7 \n\t" // 0 | |
92 "pxor %%mm4, %%mm4 \n\t" // 0, will collect the OR of all quantized magnitudes | |
93 "movq (%2), %%mm5 \n\t" // first four words of qmat | |
94 "pxor %%mm6, %%mm6 \n\t" | |
95 "psubw (%3), %%mm6 \n\t" // minus the first four words of bias | |
96 "movl $-128, %%eax \n\t" | |
97 ".balign 16 \n\t" | |
98 "1: \n\t" | |
99 "pxor %%mm1, %%mm1 \n\t" // 0 | |
100 "movq (%1, %%eax), %%mm0 \n\t" // block[i] | |
101 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] < 0 ? 0xFFFF : 0x0000 | |
102 "pxor %%mm1, %%mm0 \n\t" | |
103 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) | |
104 "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) - (-bias[0]), unsigned saturating | |
105 "pmulhw %%mm5, %%mm0 \n\t" // (mm0*qmat[0])>>16, signed high word | |
106 "por %%mm0, %%mm4 \n\t" | |
107 "pxor %%mm1, %%mm0 \n\t" | |
108 "psubw %%mm1, %%mm0 \n\t" // out = quantized magnitude with the sign of block[i] restored | |
109 "movq %%mm0, (%5, %%eax) \n\t" | |
110 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFFFF : 0x0000 | |
111 "movq (%4, %%eax), %%mm1 \n\t" | |
112 "movq %%mm7, (%1, %%eax) \n\t" // 0 | |
113 "pandn %%mm1, %%mm0 \n\t" | |
114 PMAXW(%%mm0, %%mm3) | |
115 "addl $8, %%eax \n\t" | |
116 " js 1b \n\t" | |
// horizontal maximum of the four lanes of mm3 into its low word
117 "movq %%mm3, %%mm0 \n\t" | |
118 "psrlq $32, %%mm3 \n\t" | |
119 PMAXW(%%mm0, %%mm3) | |
120 "movq %%mm3, %%mm0 \n\t" | |
121 "psrlq $16, %%mm3 \n\t" | |
122 PMAXW(%%mm0, %%mm3) | |
123 "movd %%mm3, %%eax \n\t" | |
124 "movzbl %%al, %%eax \n\t" // last_non_zero_p1 | |
125 : "+a" (last_non_zero_p1) | |
126 : "r" (block+64), "r" (qmat), "r" (bias), | |
127 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) | |
128 ); | |
129 // note the asm is split because gcc does not like that many operands ... | |
// This statement relies on mm4 still holding the OR accumulated above.
130 asm volatile( | |
131 "movd %1, %%mm1 \n\t" // max_qcoeff | |
132 SPREADW(%%mm1) | |
133 "psubusw %%mm1, %%mm4 \n\t" | |
134 "packuswb %%mm4, %%mm4 \n\t" | |
135 "movd %%mm4, %0 \n\t" // *overflow | |
136 : "=g" (*overflow) | |
137 : "g" (s->max_qcoeff) | |
138 ); | |
139 }else{ // not (FMT_H263 with mpeg_quant==0): qmat[i] and bias[i] are loaded per coefficient | |
140 asm volatile( | |
141 "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 | |
142 SPREADW(%%mm3) | |
143 "pxor %%mm7, %%mm7 \n\t" // 0 | |
144 "pxor %%mm4, %%mm4 \n\t" // 0, will collect the OR of all quantized magnitudes | |
145 "movl $-128, %%eax \n\t" | |
146 ".balign 16 \n\t" | |
147 "1: \n\t" | |
148 "pxor %%mm1, %%mm1 \n\t" // 0 | |
149 "movq (%1, %%eax), %%mm0 \n\t" // block[i] | |
150 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] < 0 ? 0xFFFF : 0x0000 | |
151 "pxor %%mm1, %%mm0 \n\t" | |
152 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) | |
153 "movq (%3, %%eax), %%mm6 \n\t" // bias[i] | |
154 "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[i], unsigned saturating | |
155 "movq (%2, %%eax), %%mm5 \n\t" // qmat[i] | |
156 "pmulhw %%mm5, %%mm0 \n\t" // ((ABS(block[i]) + bias[i])*qmat[i])>>16, signed high word | |
157 "por %%mm0, %%mm4 \n\t" | |
158 "pxor %%mm1, %%mm0 \n\t" | |
159 "psubw %%mm1, %%mm0 \n\t" // out = quantized magnitude with the sign of block[i] restored | |
160 "movq %%mm0, (%5, %%eax) \n\t" | |
161 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFFFF : 0x0000 | |
162 "movq (%4, %%eax), %%mm1 \n\t" | |
163 "movq %%mm7, (%1, %%eax) \n\t" // 0 | |
164 "pandn %%mm1, %%mm0 \n\t" | |
165 PMAXW(%%mm0, %%mm3) | |
166 "addl $8, %%eax \n\t" | |
167 " js 1b \n\t" | |
// horizontal maximum of the four lanes of mm3 into its low word
168 "movq %%mm3, %%mm0 \n\t" | |
169 "psrlq $32, %%mm3 \n\t" | |
170 PMAXW(%%mm0, %%mm3) | |
171 "movq %%mm3, %%mm0 \n\t" | |
172 "psrlq $16, %%mm3 \n\t" | |
173 PMAXW(%%mm0, %%mm3) | |
174 "movd %%mm3, %%eax \n\t" | |
175 "movzbl %%al, %%eax \n\t" // last_non_zero_p1 | |
176 : "+a" (last_non_zero_p1) | |
177 : "r" (block+64), "r" (qmat+64), "r" (bias+64), | |
178 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) | |
179 ); | |
180 // note the asm is split because gcc does not like that many operands ... | |
// This statement relies on mm4 still holding the OR accumulated above.
181 asm volatile( | |
182 "movd %1, %%mm1 \n\t" // max_qcoeff | |
183 SPREADW(%%mm1) | |
184 "psubusw %%mm1, %%mm4 \n\t" | |
185 "packuswb %%mm4, %%mm4 \n\t" | |
186 "movd %%mm4, %0 \n\t" // *overflow | |
187 : "=g" (*overflow) | |
188 : "g" (s->max_qcoeff) | |
189 ); | |
190 } | |
191 | |
192 if(s->mb_intra) temp_block[0]= level; //FIXME move after permute | |
599 | 193 |
220 | 194 // last_non_zero_p1=64; |
195 /* permute for IDCT */ | |
// The loop below is the asm form of the commented-out C loop that follows it.
// eax counts from -last_non_zero_p1 towards 0 in steps of 2 and the body runs
// before the first test, so entries are copied in pairs: an odd count also
// copies entry last_non_zero_p1, and a count of 0 copies entries 0 and 1.
// ebp and the 16-bit sp are used as scratch registers; ebp is pushed and
// popped, and esp is parked in esp_temp for the duration of the loop.
196 asm volatile( | |
344 | 197 "movl %0, %%eax \n\t" |
220 | 198 "pushl %%ebp \n\t" |
199 "movl %%esp, " MANGLE(esp_temp) "\n\t" | |
200 "1: \n\t" | |
201 "movzbl (%1, %%eax), %%ebx \n\t" | |
202 "movzbl 1(%1, %%eax), %%ebp \n\t" | |
203 "movw (%2, %%ebx, 2), %%cx \n\t" | |
204 "movw (%2, %%ebp, 2), %%sp \n\t" | |
205 "movzbl " MANGLE(permutation) "(%%ebx), %%ebx\n\t" | |
206 "movzbl " MANGLE(permutation) "(%%ebp), %%ebp\n\t" | |
207 "movw %%cx, (%3, %%ebx, 2) \n\t" | |
208 "movw %%sp, (%3, %%ebp, 2) \n\t" | |
209 "addl $2, %%eax \n\t" | |
210 " js 1b \n\t" | |
211 "movl " MANGLE(esp_temp) ", %%esp\n\t" | |
212 "popl %%ebp \n\t" | |
213 : | |
214 : "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block) | |
215 : "%eax", "%ebx", "%ecx" | |
216 ); | |
217 /* | |
218 for(i=0; i<last_non_zero_p1; i++) | |
219 { | |
220 int j= zigzag_direct_noperm[i]; | |
221 block[block_permute_op(j)]= temp_block[j]; | |
222 } | |
223 */ | |
224 //block_permute(block); | |
344 | 225 |
220 | 226 return last_non_zero_p1 - 1; |
227 } |