comparison i386/simple_idct_mmx.c @ 175:bd77d3cbb233 libavcodec

new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
author arpi_esp
date Sun, 09 Dec 2001 12:39:54 +0000
parents
children c0d8ecae7ac5
comparison
equal deleted inserted replaced
174:ac5075a55488 175:bd77d3cbb233
1 /*
2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19 #include <inttypes.h>
20 #include "../dsputil.h"
21
22 #define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
23 #define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
24 #define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
25 #define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
26 #define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer (exactly 1<<14)
27 #define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
28 #define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
29 #define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
30
31 #define ROW_SHIFT 11 // right-shift count applied to the row-pass sums (">> ROW_SHIFT" in idctRow)
32 #define COL_SHIFT 20 // 6 ; right-shift count applied to the column-pass sums (">> COL_SHIFT" in idctCol)
33
34 static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; // keeps the upper 16 bits of each 32-bit half; pand-ed with src0 in DC_COND_IDCT_CORE
35 static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; // added in WRITE3 between the "pslld $16" and the "psrad $13"
36 static int16_t __attribute__((aligned(8))) temp[64]; // 64-entry scratch block; the commented-out reference calls in idct() pass it as idctRow's output
37 static int16_t __attribute__((aligned(8))) coeffs[]= { // each row below is 4 int16_t = 8 bytes; NOTE(review): presumably asm operand %2 -- the offsets match the N(%2) loads, confirm against the operand list
38 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // byte offset 0: row rounder, two 32-bit values of 1<<(ROW_SHIFT-1)
39 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
40 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
41 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // byte offset 8: as above, plus 1<<16 in the first 32-bit value
42 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
43 // 0, 0, 0, 0,
44 // 0, 0, 0, 0,
45
46 C4, C2, C4, C2, // byte offset 16
47 C4, C6, C4, C6, // byte offset 24
48 C1, C3, C1, C3, // byte offset 32
49 C5, C7, C5, C7, // byte offset 40
50
51 C4, C6, C4, C6, // byte offset 48
52 -C4, -C2, -C4, -C2, // byte offset 56
53 C3, -C7, C3, -C7, // byte offset 64
54 -C1, -C5, -C1, -C5, // byte offset 72
55
56 C4, -C6, C4, -C6, // byte offset 80
57 -C4, C2, -C4, C2, // byte offset 88
58 C5, -C1, C5, -C1, // byte offset 96
59 C7, C3, C7, C3, // byte offset 104
60
61 C4, -C2, C4, -C2, // byte offset 112
62 C4, -C6, C4, -C6, // byte offset 120
63 C7, -C5, C7, -C5, // byte offset 128
64 C3, -C1, C3, -C1 // byte offset 136
65 };
66 #if 0
67 static void inline idctCol (int16_t * col, int16_t *input)
68 {
69 #undef C0
70 #undef C1
71 #undef C2
72 #undef C3
73 #undef C4
74 #undef C5
75 #undef C6
76 #undef C7
77 int a0, a1, a2, a3, b0, b1, b2, b3;
78 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
79 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
80 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
81 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
82 const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
83 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
84 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
85 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
86 /*
87 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
88 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
89 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
90 return;
91 }*/
92
93 col[8*0] = input[8*0 + 0];
94 col[8*1] = input[8*2 + 0];
95 col[8*2] = input[8*0 + 1];
96 col[8*3] = input[8*2 + 1];
97 col[8*4] = input[8*4 + 0];
98 col[8*5] = input[8*6 + 0];
99 col[8*6] = input[8*4 + 1];
100 col[8*7] = input[8*6 + 1];
101
102 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
103 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
104 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
105 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
106
107 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
108 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
109 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
110 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
111
112 col[8*0] = (a0 + b0) >> COL_SHIFT;
113 col[8*1] = (a1 + b1) >> COL_SHIFT;
114 col[8*2] = (a2 + b2) >> COL_SHIFT;
115 col[8*3] = (a3 + b3) >> COL_SHIFT;
116 col[8*4] = (a3 - b3) >> COL_SHIFT;
117 col[8*5] = (a2 - b2) >> COL_SHIFT;
118 col[8*6] = (a1 - b1) >> COL_SHIFT;
119 col[8*7] = (a0 - b0) >> COL_SHIFT;
120 }
121
122 static void inline idctRow (int16_t * output, int16_t * input)
123 {
124 int16_t row[8];
125
126 int a0, a1, a2, a3, b0, b1, b2, b3;
127 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
128 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
129 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
130 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
131 const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
132 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
133 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
134 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
135
136 row[0] = input[0];
137 row[2] = input[1];
138 row[4] = input[4];
139 row[6] = input[5];
140 row[1] = input[8];
141 row[3] = input[9];
142 row[5] = input[12];
143 row[7] = input[13];
144
145 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
146 row[0] = row[1] = row[2] = row[3] = row[4] =
147 row[5] = row[6] = row[7] = row[0]<<3;
148 output[0] = row[0];
149 output[2] = row[1];
150 output[4] = row[2];
151 output[6] = row[3];
152 output[8] = row[4];
153 output[10] = row[5];
154 output[12] = row[6];
155 output[14] = row[7];
156 return;
157 }
158
159 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
160 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
161 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
162 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
163
164 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
165 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
166 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
167 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
168
169 row[0] = (a0 + b0) >> ROW_SHIFT;
170 row[1] = (a1 + b1) >> ROW_SHIFT;
171 row[2] = (a2 + b2) >> ROW_SHIFT;
172 row[3] = (a3 + b3) >> ROW_SHIFT;
173 row[4] = (a3 - b3) >> ROW_SHIFT;
174 row[5] = (a2 - b2) >> ROW_SHIFT;
175 row[6] = (a1 - b1) >> ROW_SHIFT;
176 row[7] = (a0 - b0) >> ROW_SHIFT;
177
178 output[0] = row[0];
179 output[2] = row[1];
180 output[4] = row[2];
181 output[6] = row[3];
182 output[8] = row[4];
183 output[10] = row[5];
184 output[12] = row[6];
185 output[14] = row[7];
186 }
187 #endif
188
189 static inline void idct(int16_t *block)
190 {
191 int i;
192 //for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ];
193 //for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ];
194 //for(i=0; i<64; i++) block[i]= temp[i];
195 //block_permute(block);
196 /*
197 idctRow(temp, block);
198 idctRow(temp+16, block+16);
199 idctRow(temp+1, block+2);
200 idctRow(temp+17, block+18);
201 idctRow(temp+32, block+32);
202 idctRow(temp+48, block+48);
203 idctRow(temp+33, block+34);
204 idctRow(temp+49, block+50);
205 */
206
207 asm volatile(
208 // "lea 64(%0), %%eax \n\t"
209 //r0,r2,R0,R2 r4,r6,R4,R6 r1,r3,R1,R3 r5,r7,R5,R7
210 //src0 src4 src1 src5
211 //r0,R0,r7,R7 r1,R1,r6,R6 r2,R2,r5,R5 r3,R3,r4,R4
212 //dst0 dst1 dst2 dst3
213 #if 0 //Alternative, simpler variant
214 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
215 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
216 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
217 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
218 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
219 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
220 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
221 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
222 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
223 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
224 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
225 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
226 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
227 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
228 #rounder ", %%mm4 \n\t"\
229 \
230 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
231 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
232 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
233 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
234 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
235 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
236 "psrad $" #shift ", %%mm6 \n\t"\
237 "psrad $" #shift ", %%mm4 \n\t"\
238 WRITE0(%%mm6, %%mm4, dst) \
239 \
240 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
241 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
242 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
243 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
244 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
245 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
246 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
247 #rounder ", %%mm4 \n\t"\
248 \
249 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
250 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
251 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
252 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
253 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
254 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
255 "psrad $" #shift ", %%mm6 \n\t"\
256 "psrad $" #shift ", %%mm4 \n\t"\
257 WRITE1(%%mm6, %%mm4, dst, %%mm7) \
258 \
259 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
260 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
261 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
262 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
263 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
264 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
265 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
266 #rounder ", %%mm4 \n\t"\
267 \
268 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
269 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
270 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
271 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
272 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
273 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
274 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
275 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
276 "psrad $" #shift ", %%mm6 \n\t"\
277 "psrad $" #shift ", %%mm4 \n\t"\
278 \
279 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
280 #rounder ", %%mm0 \n\t"\
281 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
282 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
283 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
284 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
285 "psrad $" #shift ", %%mm2 \n\t"\
286 "psrad $" #shift ", %%mm0 \n\t"\
287 WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
288
289 #define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
290 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
291 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
292 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
293 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
294 "movq wm1010, %%mm4 \n\t"\
295 "pand %%mm0, %%mm4 \n\t"\
296 "por %%mm1, %%mm4 \n\t"\
297 "por %%mm2, %%mm4 \n\t"\
298 "por %%mm3, %%mm4 \n\t"\
299 "packssdw %%mm4,%%mm4 \n\t"\
300 "movd %%mm4, %%eax \n\t"\
301 "orl %%eax, %%eax \n\t"\
302 "jz 1f \n\t"\
303 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
304 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
305 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
306 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
307 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
308 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
309 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
310 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
311 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
312 #rounder ", %%mm4 \n\t"\
313 \
314 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
315 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
316 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
317 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
318 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
319 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
320 "psrad $" #shift ", %%mm6 \n\t"\
321 "psrad $" #shift ", %%mm4 \n\t"\
322 WRITE0(%%mm6, %%mm4, dst) \
323 \
324 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
325 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
326 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
327 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
328 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
329 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
330 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
331 #rounder ", %%mm4 \n\t"\
332 \
333 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
334 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
335 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
336 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
337 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
338 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
339 "psrad $" #shift ", %%mm6 \n\t"\
340 "psrad $" #shift ", %%mm4 \n\t"\
341 WRITE1(%%mm6, %%mm4, dst, %%mm7) \
342 \
343 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
344 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
345 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
346 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
347 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
348 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
349 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
350 #rounder ", %%mm4 \n\t"\
351 \
352 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
353 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
354 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
355 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
356 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
357 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
358 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
359 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
360 "psrad $" #shift ", %%mm6 \n\t"\
361 "psrad $" #shift ", %%mm4 \n\t"\
362 \
363 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
364 #rounder ", %%mm0 \n\t"\
365 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
366 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
367 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
368 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
369 "psrad $" #shift ", %%mm2 \n\t"\
370 "psrad $" #shift ", %%mm0 \n\t"\
371 WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
372 "jmp 2f \n\t"\
373 "1: \n\t"\
374 WRITE3(%%mm0, dst)\
375 "2: \n\t"\
376
377
378 #define WRITE0(s0, s7, dst)\
379 "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\
380 "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */
381
382 #define WRITE1(s1, s6, dst, tmp)\
383 "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\
384 "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\
385 "movq " #tmp ", " #dst " \n\t"\
386 "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\
387 "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\
388 "movq " #s6 ", 24+" #dst " \n\t"
389
390 #define WRITE2(s2, s5, s3, s4, dst)\
391 "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\
392 "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\
393 "movq " #s2 ", 8+" #dst " \n\t"\
394 "movq " #s4 ", 16+" #dst " \n\t"
395
396 #define WRITE3(a, dst)\
397 "pslld $16, " #a " \n\t"\
398 "psrad $13, " #a " \n\t"\
399 "packssdw " #a ", " #a " \n\t"\
400 "movq " #a ", " #dst " \n\t"\
401 "movq " #a ", 8+" #dst " \n\t"\
402 "movq " #a ", 16+" #dst " \n\t"\
403 "movq " #a ", 24+" #dst " \n\t"\
404
405 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
406 IDCT_CORE( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
407 /*
408 DC_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
409 DC_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
410 DC_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
411 */
412 IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
413 IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
414 IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
415
416 #undef WRITE0
417 #undef WRITE1
418 #undef WRITE2
419
420 #define WRITE0(s0, s7, dst)\
421 "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\
422 "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\
423 "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\
424 "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */
425
426 #define WRITE1(s1, s6, dst, tmp)\
427 "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\
428 "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\
429 "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\
430 "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */
431
432 #define WRITE2(s2, s5, s3, s4, dst)\
433 "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\
434 "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\
435 "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\
436 "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\
437 "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\
438 "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\
439 "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\
440 "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\
441
442 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
443 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
444 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
445 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
446 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
447
448 #else
449
450 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
451 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
452 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
453 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
454 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
455 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
456 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
457 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
458 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
459 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
460 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
461 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
462 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
463 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
464 #rounder ", %%mm4 \n\t"\
465 \
466 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
467 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
468 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
469 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
470 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
471 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
472 "psrad $" #shift ", %%mm6 \n\t"\
473 "psrad $" #shift ", %%mm4 \n\t"\
474 WRITE0(%%mm6, %%mm4, dst) \
475 \
476 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
477 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
478 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
479 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
480 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
481 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
482 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
483 #rounder ", %%mm4 \n\t"\
484 \
485 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
486 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
487 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
488 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
489 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
490 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
491 "psrad $" #shift ", %%mm6 \n\t"\
492 "psrad $" #shift ", %%mm4 \n\t"\
493 WRITE1(%%mm6, %%mm4, dst, %%mm7) \
494 \
495 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
496 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
497 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
498 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
499 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
500 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
501 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
502 #rounder ", %%mm4 \n\t"\
503 \
504 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
505 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
506 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
507 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
508 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
509 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
510 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
511 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
512 "psrad $" #shift ", %%mm6 \n\t"\
513 "psrad $" #shift ", %%mm4 \n\t"\
514 \
515 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
516 #rounder ", %%mm0 \n\t"\
517 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
518 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
519 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
520 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
521 "psrad $" #shift ", %%mm2 \n\t"\
522 "psrad $" #shift ", %%mm0 \n\t"\
523 WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
524
525 #define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
526 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
527 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
528 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
529 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
530 "movq wm1010, %%mm4 \n\t"\
531 "pand %%mm0, %%mm4 \n\t"\
532 "por %%mm1, %%mm4 \n\t"\
533 "por %%mm2, %%mm4 \n\t"\
534 "por %%mm3, %%mm4 \n\t"\
535 "packssdw %%mm4,%%mm4 \n\t"\
536 "movd %%mm4, %%eax \n\t"\
537 "orl %%eax, %%eax \n\t"\
538 "jz 1f \n\t"\
539 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
540 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
541 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
542 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
543 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
544 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
545 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
546 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
547 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
548 #rounder ", %%mm4 \n\t"\
549 \
550 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
551 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
552 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
553 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
554 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
555 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
556 "psrad $" #shift ", %%mm6 \n\t"\
557 "psrad $" #shift ", %%mm4 \n\t"\
558 WRITE0(%%mm6, %%mm4, dst) \
559 \
560 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
561 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
562 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
563 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
564 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
565 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
566 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
567 #rounder ", %%mm4 \n\t"\
568 \
569 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
570 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
571 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
572 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
573 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
574 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
575 "psrad $" #shift ", %%mm6 \n\t"\
576 "psrad $" #shift ", %%mm4 \n\t"\
577 WRITE1(%%mm6, %%mm4, dst, %%mm7) \
578 \
579 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
580 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
581 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
582 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
583 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
584 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
585 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
586 #rounder ", %%mm4 \n\t"\
587 \
588 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
589 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
590 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
591 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
592 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
593 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
594 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
595 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
596 "psrad $" #shift ", %%mm6 \n\t"\
597 "psrad $" #shift ", %%mm4 \n\t"\
598 \
599 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
600 #rounder ", %%mm0 \n\t"\
601 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
602 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
603 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
604 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
605 "psrad $" #shift ", %%mm2 \n\t"\
606 "psrad $" #shift ", %%mm0 \n\t"\
607 WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
608 "jmp 2f \n\t"\
609 "#.balign 16 \n\t"\
610 "1: \n\t"\
611 WRITE3(%%mm0, dst)\
612 "2: \n\t"\
613
614 #define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \
615 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
616 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
617 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
618 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
619 "movq %%mm0, %%mm4 \n\t"\
620 "por %%mm1, %%mm4 \n\t"\
621 "por %%mm2, %%mm4 \n\t"\
622 "por %%mm3, %%mm4 \n\t"\
623 "packssdw %%mm4, %%mm4 \n\t"\
624 "movd %%mm4, %%eax \n\t"\
625 "orl %%eax, %%eax \n\t"\
626 "jz " #bt " \n\t"\
627 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
628 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
629 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
630 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
631 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
632 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
633 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
634 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
635 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
636 #rounder ", %%mm4 \n\t"\
637 \
638 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
639 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
640 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
641 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
642 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
643 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
644 "psrad $" #shift ", %%mm6 \n\t"\
645 "psrad $" #shift ", %%mm4 \n\t"\
646 WRITE0(%%mm6, %%mm4, dst) \
647 \
648 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
649 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
650 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
651 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
652 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
653 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
654 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
655 #rounder ", %%mm4 \n\t"\
656 \
657 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
658 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
659 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
660 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
661 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
662 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
663 "psrad $" #shift ", %%mm6 \n\t"\
664 "psrad $" #shift ", %%mm4 \n\t"\
665 WRITE1(%%mm6, %%mm4, dst, %%mm7) \
666 \
667 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
668 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
669 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
670 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
671 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
672 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
673 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
674 #rounder ", %%mm4 \n\t"\
675 \
676 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
677 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
678 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
679 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
680 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
681 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
682 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
683 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
684 "psrad $" #shift ", %%mm6 \n\t"\
685 "psrad $" #shift ", %%mm4 \n\t"\
686 \
687 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
688 #rounder ", %%mm0 \n\t"\
689 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
690 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
691 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
692 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
693 "psrad $" #shift ", %%mm2 \n\t"\
694 "psrad $" #shift ", %%mm0 \n\t"\
695 WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
696
697
698 #define WRITE0(s0, s7, dst)\
699 "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\
700 "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */
701
702 #define WRITE1(s1, s6, dst, tmp)\
703 "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\
704 "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\
705 "movq " #tmp ", " #dst " \n\t"\
706 "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\
707 "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\
708 "movq " #s6 ", 24+" #dst " \n\t"
709
710 #define WRITE2(s2, s5, s3, s4, dst)\
711 "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\
712 "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\
713 "movq " #s2 ", 8+" #dst " \n\t"\
714 "movq " #s4 ", 16+" #dst " \n\t"
715
716 #define WRITE3(a, dst)\
717 "pslld $16, " #a " \n\t"\
718 "paddd d40000, " #a " \n\t"\
719 "psrad $13, " #a " \n\t"\
720 "packssdw " #a ", " #a " \n\t"\
721 "movq " #a ", " #dst " \n\t"\
722 "movq " #a ", 8+" #dst " \n\t"\
723 "movq " #a ", 16+" #dst " \n\t"\
724 "movq " #a ", 24+" #dst " \n\t"\
725
726 #define WRITE0b(s0, s7, dst)\
727 "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\
728 "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\
729 "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\
730 "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */
731
732 #define WRITE1b(s1, s6, dst, tmp)\
733 "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\
734 "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\
735 "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\
736 "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */
737
738 #define WRITE2b(s2, s5, s3, s4, dst)\
739 "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\
740 "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\
741 "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\
742 "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\
743 "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\
744 "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\
745 "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\
746 "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\
747
748
749 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
750 DC_COND_IDCT_CORE( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
751 Z_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
752 Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
753 Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
754
755 #undef IDCT_CORE
756 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
757 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
758 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
759 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
760 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
761 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
762 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
763 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
764 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
765 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
766 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
767 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
768 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
769 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
770 \
771 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
772 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
773 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
774 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
775 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
776 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
777 "psrad $" #shift ", %%mm6 \n\t"\
778 "psrad $" #shift ", %%mm4 \n\t"\
779 WRITE0b(%%mm6, %%mm4, dst) \
780 \
781 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
782 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
783 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
784 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
785 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
786 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
787 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
788 \
789 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
790 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
791 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
792 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
793 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
794 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
795 "psrad $" #shift ", %%mm6 \n\t"\
796 "psrad $" #shift ", %%mm4 \n\t"\
797 WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
798 \
799 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
800 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
801 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
802 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
803 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
804 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
805 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
806 \
807 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
808 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
809 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
810 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
811 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
812 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
813 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
814 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
815 "psrad $" #shift ", %%mm6 \n\t"\
816 "psrad $" #shift ", %%mm4 \n\t"\
817 \
818 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
819 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
820 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
821 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
822 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
823 "psrad $" #shift ", %%mm2 \n\t"\
824 "psrad $" #shift ", %%mm0 \n\t"\
825 WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
826
827 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
828 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
829 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
830 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
831 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
832 "jmp 9f \n\t"
833
834 "#.balign 16 \n\t"\
835 "4: \n\t"
836 Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
837 Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
838
839 #undef IDCT_CORE
840 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
841 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
842 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
843 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
844 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
845 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
846 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
847 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
848 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
849 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
850 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
851 \
852 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
853 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
854 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
855 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
856 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
857 "psrad $" #shift ", %%mm7 \n\t"\
858 "psrad $" #shift ", %%mm4 \n\t"\
859 WRITE0b(%%mm7, %%mm4, dst) \
860 \
861 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
862 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
863 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
864 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
865 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
866 \
867 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
868 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
869 "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\
870 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
871 "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\
872 "psrad $" #shift ", %%mm7 \n\t"\
873 "psrad $" #shift ", %%mm4 \n\t"\
874 WRITE1b(%%mm7, %%mm4, dst, %%mm6) \
875 \
876 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
877 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
878 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
879 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
880 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
881 \
882 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
883 "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\
884 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
885 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
886 "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\
887 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
888 "psrad $" #shift ", %%mm7 \n\t"\
889 "psrad $" #shift ", %%mm4 \n\t"\
890 \
891 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
892 "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\
893 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
894 "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\
895 "psrad $" #shift ", %%mm3 \n\t"\
896 "psrad $" #shift ", %%mm0 \n\t"\
897 WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
898
899 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
900 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
901 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
902 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
903 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
904 "jmp 9f \n\t"
905
906 "#.balign 16 \n\t"\
907 "6: \n\t"
908 Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
909
910 #undef IDCT_CORE
911 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
912 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
913 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
914 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
915 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
916 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
917 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
918 \
919 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
920 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
921 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
922 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
923 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
924 "psrad $" #shift ", %%mm7 \n\t"\
925 "psrad $" #shift ", %%mm4 \n\t"\
926 WRITE0b(%%mm7, %%mm4, dst) \
927 \
928 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
929 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
930 \
931 "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\
932 "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
933 "paddd %%mm5, %%mm7 \n\t" /* A1+B1 a1+b1 */\
934 "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\
935 "psubd %%mm7, %%mm5 \n\t" /* A1-B1 a1-b1 */\
936 "psrad $" #shift ", %%mm7 \n\t"\
937 "psrad $" #shift ", %%mm5 \n\t"\
938 WRITE1b(%%mm7, %%mm5, dst, %%mm6) \
939 \
940 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
941 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
942 \
943 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
944 "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\
945 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
946 "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\
947 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
948 "psrad $" #shift ", %%mm7 \n\t"\
949 "psrad $" #shift ", %%mm4 \n\t"\
950 \
951 "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\
952 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
953 "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\
954 "psrad $" #shift ", %%mm3 \n\t"\
955 "psrad $" #shift ", %%mm0 \n\t"\
956 WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
957
958 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
959 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
960 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
961 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
962 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
963 "jmp 9f \n\t"
964
965 "#.balign 16 \n\t"\
966 "2: \n\t"
967 Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
968
969 #undef IDCT_CORE
970 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
971 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
972 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
973 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
974 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
975 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
976 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
977 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
978 "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
979 "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
980 \
981 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
982 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
983 "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
984 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
985 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
986 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
987 "psrad $" #shift ", %%mm6 \n\t"\
988 "psrad $" #shift ", %%mm4 \n\t"\
989 WRITE0b(%%mm6, %%mm4, dst) \
990 \
991 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
992 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
993 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
994 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
995 \
996 "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\
997 "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
998 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
999 "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\
1000 "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\
1001 "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\
1002 "psrad $" #shift ", %%mm6 \n\t"\
1003 "psrad $" #shift ", %%mm5 \n\t"\
1004 WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
1005 \
1006 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
1007 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1008 "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
1009 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
1010 \
1011 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
1012 "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
1013 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
1014 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
1015 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
1016 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1017 "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
1018 "psrad $" #shift ", %%mm6 \n\t"\
1019 "psrad $" #shift ", %%mm4 \n\t"\
1020 \
1021 "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
1022 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
1023 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
1024 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
1025 "psrad $" #shift ", %%mm2 \n\t"\
1026 "psrad $" #shift ", %%mm0 \n\t"\
1027 WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
1028
1029 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
1030 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1031 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1032 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1033 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1034 "jmp 9f \n\t"
1035
1036 "#.balign 16 \n\t"\
1037 "3: \n\t"
1038 #undef IDCT_CORE
1039 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
1040 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
1041 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1042 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
1043 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
1044 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
1045 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1046 \
1047 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
1048 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
1049 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
1050 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1051 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1052 "psrad $" #shift ", %%mm6 \n\t"\
1053 "psrad $" #shift ", %%mm4 \n\t"\
1054 WRITE0b(%%mm6, %%mm4, dst) \
1055 \
1056 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
1057 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1058 \
1059 "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\
1060 "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
1061 "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\
1062 "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\
1063 "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\
1064 "psrad $" #shift ", %%mm6 \n\t"\
1065 "psrad $" #shift ", %%mm5 \n\t"\
1066 WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
1067 \
1068 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
1069 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1070 \
1071 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
1072 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
1073 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
1074 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
1075 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1076 "psrad $" #shift ", %%mm6 \n\t"\
1077 "psrad $" #shift ", %%mm4 \n\t"\
1078 \
1079 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
1080 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
1081 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
1082 "psrad $" #shift ", %%mm2 \n\t"\
1083 "psrad $" #shift ", %%mm0 \n\t"\
1084 WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
1085
1086 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
1087 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1088 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1089 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1090 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1091 "jmp 9f \n\t"
1092
1093 "#.balign 16 \n\t"\
1094 "5: \n\t"
1095 #undef IDCT_CORE
1096 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
1097 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
1098 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
1099 "movq %%mm4, %%mm6\n\t"\
1100 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
1101 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
1102 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
1103 "movq %%mm5, %%mm7\n\t"\
1104 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
1105 "movq 8+" #src0 ", %%mm2 \n\t" /*2R2 R0 r2 r0 */\
1106 "pmaddwd %%mm2, %%mm6 \n\t" /*2C2R2+C4R0 C2r2+C4r0 */\
1107 "movq 8+" #src4 ", %%mm3 \n\t" /*2R6 R4 r6 r4 */\
1108 "pmaddwd %%mm3, %%mm7 \n\t" /*2C6R6+C4R4 C6r6+C4r4 */\
1109 \
1110 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1111 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
1112 "psrad $" #shift ", %%mm4 \n\t"\
1113 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
1114 \
1115 "paddd %%mm7, %%mm6 \n\t" /*2A0 a0 */\
1116 "movq 56(%2), %%mm7 \n\t" /* -C2 -C4 -C2 -C4 */\
1117 "psrad $" #shift ", %%mm6 \n\t"\
1118 "pmaddwd %%mm1, %%mm7 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
1119 \
1120 "packssdw %%mm6, %%mm4 \n\t" /* C0, c0, C0, c0 */\
1121 "movq 48(%2), %%mm6 \n\t" /* C6 C4 C6 C4 */\
1122 "movq %%mm4, " #dst " \n\t" /* C0, c0 */\
1123 "pmaddwd %%mm2, %%mm6 \n\t" /*2C6R2+C4R0 C6r2+C4r0 */\
1124 \
1125 "movq %%mm4, 112+" #dst " \n\t" /* C0, c0 */\
1126 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
1127 "pmaddwd %%mm3, %%mm4 \n\t" /*2-C2R6-C4R4 -C2r6-C4r4 */\
1128 \
1129 "paddd %%mm5, %%mm7 \n\t" /* A1 a1 */\
1130 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
1131 "psrad $" #shift ", %%mm7 \n\t"\
1132 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
1133 \
1134 "paddd %%mm4, %%mm6 \n\t" /*2A1 a1 */\
1135 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
1136 \
1137 "psrad $" #shift ", %%mm6 \n\t"\
1138 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
1139 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
1140 \
1141 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
1142 "packssdw %%mm6, %%mm7 \n\t" /* C1, c1, C1, c1 */\
1143 \
1144 "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\
1145 "movq %%mm7, 16+" #dst " \n\t" /* C1, c1 */\
1146 "pmaddwd %%mm2, %%mm6 \n\t" /*2-C6R2+C4R0 -C6r2+C4r0 */\
1147 \
1148 "movq %%mm7, 96+" #dst " \n\t" /* C1, c1 */\
1149 "movq 88(%2), %%mm7 \n\t" /* C2 -C4 C2 -C4 */\
1150 "pmaddwd %%mm3, %%mm7 \n\t" /*2C2R6-C4R4 C2r6-C4r4 */\
1151 \
1152 "pmaddwd 112(%2), %%mm2 \n\t" /*2-C2R2+C4R0 -C2r2+C4r0 */\
1153 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
1154 \
1155 "pmaddwd 120(%2), %%mm3 \n\t" /*2-C6R6+C4R4 -C6r6+C4r4 */\
1156 "psrad $" #shift ", %%mm4 \n\t"\
1157 \
1158 "paddd %%mm7, %%mm6 \n\t" /*2A2 a2 */\
1159 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
1160 \
1161 "psrad $" #shift ", %%mm6 \n\t"\
1162 \
1163 "packssdw %%mm6, %%mm4 \n\t" /* C2, c2, C2, c2 */\
1164 "movq %%mm4, 32+" #dst " \n\t" /* C2, c2 */\
1165 "psrad $" #shift ", %%mm0 \n\t"\
1166 "paddd %%mm3, %%mm2 \n\t" /*2A3 a3 */\
1167 \
1168 "movq %%mm4, 80+" #dst " \n\t" /* C2, c2 */\
1169 "psrad $" #shift ", %%mm2 \n\t"\
1170 \
1171 "packssdw %%mm2, %%mm0 \n\t" /* C3, c3, C3, c3 */\
1172 "movq %%mm0, 48+" #dst " \n\t" /* C3, c3 */\
1173 "movq %%mm0, 64+" #dst " \n\t" /* C3, c3 */\
1174
1175 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
1176 IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1177 //IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1178 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1179 //IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1180 "jmp 9f \n\t"
1181
1182
1183 "#.balign 16 \n\t"\
1184 "1: \n\t"
1185 #undef IDCT_CORE
1186 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
1187 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
1188 "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
1189 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1190 "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
1191 "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
1192 "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
1193 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
1194 "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
1195 "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1196 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1197 \
1198 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
1199 "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
1200 "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
1201 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1202 "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1203 "psrad $" #shift ", %%mm6 \n\t"\
1204 "psrad $" #shift ", %%mm4 \n\t"\
1205 WRITE0b(%%mm6, %%mm4, dst) \
1206 \
1207 "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
1208 "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
1209 "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
1210 "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1211 "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
1212 \
1213 "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
1214 "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
1215 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
1216 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
1217 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
1218 "psrad $" #shift ", %%mm6 \n\t"\
1219 "psrad $" #shift ", %%mm4 \n\t"\
1220 WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
1221 \
1222 "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
1223 "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
1224 "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
1225 "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1226 "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
1227 \
1228 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
1229 "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
1230 "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
1231 "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
1232 "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
1233 "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1234 "psrad $" #shift ", %%mm6 \n\t"\
1235 "psrad $" #shift ", %%mm4 \n\t"\
1236 \
1237 "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
1238 "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
1239 "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
1240 "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
1241 "psrad $" #shift ", %%mm2 \n\t"\
1242 "psrad $" #shift ", %%mm0 \n\t"\
1243 WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
1244
1245 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
1246 IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1247 IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1248 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1249 IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1250 "jmp 9f \n\t"
1251
1252
1253 "#.balign 16 \n\t"
1254 "7: \n\t"
1255 #undef IDCT_CORE
1256 #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
1257 "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
1258 "movq 16(%2), %%mm2 \n\t" /* C2 C4 C2 C4 */\
1259 "movq 8+" #src0 ", %%mm1 \n\t" /* R2 R0 r2 r0 */\
1260 "pmaddwd %%mm0, %%mm2 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
1261 "movq 16(%2), %%mm3 \n\t" /* C2 C4 C2 C4 */\
1262 "pmaddwd %%mm1, %%mm3 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
1263 \
1264 "movq 48(%2), %%mm4 \n\t" /* C6 C4 C6 C4 */\
1265 "pmaddwd %%mm0, %%mm4 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
1266 "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
1267 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
1268 "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\
1269 "pmaddwd %%mm0, %%mm6 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
1270 "movq 80(%2), %%mm7 \n\t" /* -C6 C4 -C6 C4 */\
1271 "pmaddwd %%mm1, %%mm7 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
1272 "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
1273 "psrad $" #shift ", %%mm2 \n\t"\
1274 "psrad $" #shift ", %%mm3 \n\t"\
1275 "pmaddwd 112(%2), %%mm1 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
1276 "packssdw %%mm3, %%mm2 \n\t" /* C0, c0, C0, c0 */\
1277 "movq %%mm2, " #dst " \n\t" /* C0, c0 */\
1278 "psrad $" #shift ", %%mm4 \n\t"\
1279 "psrad $" #shift ", %%mm5 \n\t"\
1280 "movq %%mm2, 112+" #dst " \n\t" /* C0, c0 */\
1281 "packssdw %%mm5, %%mm4 \n\t" /* C1, c1, C1, c1 */\
1282 "movq %%mm4, 16+" #dst " \n\t" /* C0, c0 */\
1283 "psrad $" #shift ", %%mm7 \n\t"\
1284 "psrad $" #shift ", %%mm6 \n\t"\
1285 "movq %%mm4, 96+" #dst " \n\t" /* C0, c0 */\
1286 "packssdw %%mm7, %%mm6 \n\t" /* C2, c2, C2, c2 */\
1287 "movq %%mm6, 32+" #dst " \n\t" /* C0, c0 */\
1288 "psrad $" #shift ", %%mm0 \n\t"\
1289 "movq %%mm6, 80+" #dst " \n\t" /* C0, c0 */\
1290 "psrad $" #shift ", %%mm1 \n\t"\
1291 "packssdw %%mm1, %%mm0 \n\t" /* C3, c3, C3, c3 */\
1292 "movq %%mm0, 48+" #dst " \n\t" /* C0, c0 */\
1293 "movq %%mm0, 64+" #dst " \n\t" /* C0, c0 */\
1294
1295 //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
1296 IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1297 //IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1298 IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1299 //IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1300
1301
1302 #endif
1303
1304 /*
1305 Input
1306 00 20 02 22 40 60 42 62
1307 10 30 12 32 50 70 52 72
1308 01 21 03 23 41 61 43 63
1309 11 31 13 33 51 71 53 73
1310 04 24 06 26 44 64 46 66
1311 14 34 16 36 54 74 56 76
1312 ...
1313 */
1314 /*
1315 Temp
1316 00 02 10 12 20 22 30 32
1317 40 42 50 52 60 62 70 72
1318 01 03 11 13 21 23 31 33
1319 41 43 51 53 61 63 71 73
1320 04 06 14 16 24 26 34 36
1321 44 46 54 56 64 66 74 76
1322 05 07 15 17 25 27 35 37
1323 45 47 55 57 65 67 75 77
1324 */
1325
1326 /*
1327 Output
1328 00 10 20 30 40 50 60 70
1329 01 11 21 31 41 51 61 71
1330 ...
1331 */
1332
1333 "9: \n\t"
1334 :: "r" (block), "r" (temp), "r" (coeffs)
1335 : "%eax"
1336 );
1337 /*
1338 idctCol(block, temp);
1339 idctCol(block+1, temp+2);
1340 idctCol(block+2, temp+4);
1341 idctCol(block+3, temp+6);
1342 idctCol(block+4, temp+8);
1343 idctCol(block+5, temp+10);
1344 idctCol(block+6, temp+12);
1345 idctCol(block+7, temp+14);
1346 */
1347 }
1348
/**
 * Public entry point for the MMX "simple" inverse DCT.
 *
 * Forwards directly to idct(), which transforms the coefficients in place:
 * the assembly reads its input rows from `block`, uses the file-level
 * `temp` buffer as intermediate storage and stores the final results back
 * into `block`.
 *
 * @param block  coefficient block to transform; idct() accesses byte
 *               offsets 0..127 from this pointer, so it must reference at
 *               least 64 int16_t values. Overwritten with the result.
 *
 * NOTE(review): the visible end of idct() does not execute `emms`; confirm
 * that callers restore the FPU state before using floating point.
 */
void simple_idct_mmx(int16_t *block)
{
    idct(block);
}