Mercurial > libavcodec.hg
comparison i386/fdct_mmx.s @ 0:986e461dc072 libavcodec
Initial revision
author | glantau |
---|---|
date | Sun, 22 Jul 2001 14:18:56 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:986e461dc072 |
---|---|
1 ; ////////////////////////////////////////////////////////////////////////////// | |
2 ; // | |
3 ; // fdctam32.c - AP922 MMX(3D-Now) forward-DCT | |
4 ; // ---------- | |
5 ; // Intel Application Note AP-922 - fast, precise implementation of DCT | |
6 ; // http://developer.intel.com/vtune/cbts/appnotes.htm | |
7 ; // ---------- | |
8 ; // | |
9 ; // This routine can use a 3D-Now/MMX enhancement to increase the | |
10 ; // accuracy of the fdct_col_4 macro. The dct_col function uses 3D-Now's | |
11 ; // PMULHRW instead of MMX's PMULHW (and POR). The substitution improves | |
12 ; // accuracy very slightly, with a performance penalty. If the target CPU | |
13 ; // does not support 3D-Now, then this function cannot be executed. | |
14 ; // | |
15 ; // For a fast, precise MMX implementation of inverse-DCT | |
16 ; // visit http://www.elecard.com/peter | |
17 ; // | |
18 ; // v1.0 07/22/2000 (initial release) | |
19 ; // | |
20 ; // liaor@iname.com http://members.tripod.com/~liaor | |
21 ; ////////////////////////////////////////////////////////////////////////////// | |
22 | |
23 ;;; | |
24 ;;; A.Stevens Jul 2000: ported to nasm syntax and disentangled | |
25 ;;; from Win**** compiler specific stuff. | |
26 ;;; All the real work was done above though. | |
27 ;;; See above for how to optimise quality on 3DNow! CPU's | |
28 | |
29 ;; | |
30 ;; Macros for code-readability... | |
31 ;; | |
32 %define INP eax ; pointer to (short *blk); loaded from [ebp+8], advanced while processing | |
33 %define OUT ecx ; pointer to output; this file sets OUT = INP, so results overwrite blk[] in place | |
34 %define TABLE ebx ; pointer to tab_frw_01234567[] (row pass; shares ebx with TABLEF) | |
35 %define TABLEF ebx ; pointer to fdct_tg_all_16 (column pass; shares ebx with TABLE) | |
36 %define round_frw_row edx ; pointer to fdct_r_row, the rounder added before the row-pass shift | |
37 | |
38 | |
39 %define x0 INP + 0*16 ; x0..x7: byte offsets of input rows 0..7 from INP (16 bytes per row) | |
40 %define x1 INP + 1*16 | |
41 %define x2 INP + 2*16 | |
42 %define x3 INP + 3*16 | |
43 %define x4 INP + 4*16 | |
44 %define x5 INP + 5*16 | |
45 %define x6 INP + 6*16 | |
46 %define x7 INP + 7*16 | |
47 %define y0 OUT + 0*16 ; y0..y7: byte offsets of output rows 0..7 from OUT (16 bytes per row) | |
48 %define y1 OUT + 1*16 | |
49 %define y2 OUT + 2*16 | |
50 %define y3 OUT + 3*16 | |
51 %define y4 OUT + 4*16 | |
52 %define y5 OUT + 5*16 | |
53 %define y6 OUT + 6*16 | |
54 %define y7 OUT + 7*16 | |
55 | |
56 ;; | |
57 ;; Constants for DCT | |
58 ;; | |
59 %define BITS_FRW_ACC 3 ; 2 or 3 for accuracy | |
60 %define SHIFT_FRW_COL BITS_FRW_ACC ; left shift (psllw) applied to the column-pass sums/differences | |
61 %define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) ; right shift (psrad) applied to the row-pass dword sums | |
62 %define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) ; half of 1<<SHIFT_FRW_ROW; not referenced in the code shown (rounder comes from fdct_r_row) | |
63 %define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) ; half of 1<<SHIFT_FRW_COL; not referenced in the code shown | |
64 | |
65 extern fdct_one_corr ; qword OR-ed (por) into column-pass results at the "correction ... +0.5" steps | |
66 extern fdct_r_row ; Defined in C for convenience; added (paddd) before each row-pass psrad | |
67 ;; | |
68 ;; Concatenated table of forward dct transformation coeffs. | |
69 ;; | |
70 extern fdct_tg_all_16 ; Defined in C for convenience | |
71 ;; Offsets into table (8 bytes, i.e. one MMX qword, per entry; TABLEF must hold the table base) | |
72 | |
73 %define tg_1_16 (TABLEF + 0) | |
74 %define tg_2_16 (TABLEF + 8) | |
75 %define tg_3_16 (TABLEF + 16) | |
76 %define cos_4_16 (TABLEF + 24) ; not referenced in the code shown | |
77 %define ocos_4_16 (TABLEF + 32) | |
78 | |
79 ;; | |
80 ;; Concatenated table of forward dct coefficients (row pass reads 64 bytes of it per row) | |
81 ;; | |
82 extern tab_frw_01234567 ; Defined in C for convenience | |
83 | |
84 ;; Offsets into table.. | |
85 SECTION .text | |
86 | |
87 global fdct_mmx | |
88 | |
89 ;;; | |
90 ;;; void fdct_mmx( short *blk ) | |
91 ;;; | |
92 | |
93 | |
94 | |
95 ; //////////////////////////////////////////////////////////////////////// | |
96 ; // | |
97 ; // The high-level pseudocode for the fdct_am32() routine : | |
98 ; // | |
99 ; // fdct_am32() | |
100 ; // { | |
101 ; // forward_dct_col03(); // dct_column transform on cols 0-3 | |
102 ; // forward_dct_col47(); // dct_column transform on cols 4-7 | |
103 ; // for ( j = 0; j < 8; j=j+1 ) | |
104 ; // forward_dct_row1(j); // dct_row transform on row #j | |
105 ; // } | |
106 ; // | |
107 ; | |
108 | |
109 align 32 | |
110 fdct_mmx: ; void fdct_mmx(short *blk): the single argument is read from the stack at [ebp+8] | |
111 push ebp ; save caller's frame pointer | |
112 mov ebp, esp ; link | |
113 | |
114 push ebx ; ebx is used as TABLE/TABLEF; restored in the epilogue | |
115 push ecx ; ecx is used as OUT; restored in the epilogue | |
116 push edx ; edx is used as round_frw_row; restored in the epilogue | |
117 push edi ; edi is used as the row-loop counter; restored in the epilogue | |
118 | |
119 mov INP, [ebp+8]; ; input data is row 0 of blk[] (INP is eax, which is not saved/restored here) | |
120 ;// transform the left half of the matrix (4 columns) | |
121 | |
122 lea TABLEF, [fdct_tg_all_16]; base address used by the tg_*_16 / ocos_4_16 offsets | |
123 mov OUT, INP; column-pass results are written back over blk[] | |
124 | |
125 ; lea round_frw_col, [r_frw_col] | |
126 ; for ( i = 0; i < 2; i = i + 1) | |
127 ; the for-loop is executed twice. We are better off unrolling the | |
128 ; loop to avoid branch misprediction. | |
129 .mmx32_fdct_col03: ; column pass over columns 0-3: each movq handles 4 adjacent 16-bit columns of one row | |
130 movq mm0, [x1] ; 0 ; x1 | |
131 ;; | |
132 | |
133 movq mm1, [x6] ; 1 ; x6 | |
134 movq mm2, mm0 ; 2 ; x1 | |
135 | |
136 movq mm3, [x2] ; 3 ; x2 | |
137 paddsw mm0, mm1 ; t1 = x[1] + x[6] | |
138 | |
139 movq mm4, [x5] ; 4 ; x5 | |
140 psllw mm0, SHIFT_FRW_COL ; t1 | |
141 | |
142 movq mm5, [x0] ; 5 ; x0 | |
143 paddsw mm4, mm3 ; t2 = x[2] + x[5] | |
144 | |
145 paddsw mm5, [x7] ; t0 = x[0] + x[7] | |
146 psllw mm4, SHIFT_FRW_COL ; t2 | |
147 | |
148 movq mm6, mm0 ; 6 ; t1 | |
149 psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] | |
150 | |
151 movq mm1, [tg_2_16] ; 1 ; tg_2_16 | |
152 psubsw mm0, mm4 ; tm12 = t1 - t2 | |
153 | |
154 movq mm7, [x3] ; 7 ; x3 | |
155 pmulhw mm1, mm0 ; tm12*tg_2_16 | |
156 | |
157 paddsw mm7, [x4] ; t3 = x[3] + x[4] | |
158 psllw mm5, SHIFT_FRW_COL ; t0 | |
159 | |
160 paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 | |
161 psllw mm7, SHIFT_FRW_COL ; t3 | |
162 | |
163 movq mm4, mm5 ; 4 ; t0 | |
164 psubsw mm5, mm7 ; tm03 = t0 - t3 | |
165 | |
166 paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 | |
167 paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 | |
168 | |
169 por mm1, [fdct_one_corr] ; correction y2 +0.5 | |
170 psllw mm2, SHIFT_FRW_COL+1 ; t6 (shifted one bit more than t0..t3) | |
171 | |
172 pmulhw mm5, [tg_2_16] ; tm03*tg_2_16 | |
173 movq mm7, mm4 ; 7 ; tp03 | |
174 | |
175 psubsw mm3, [x5] ; t5 = x[2] - x[5] | |
176 psubsw mm4, mm6 ; y4 = tp03 - tp12 | |
177 | |
178 movq [y2], mm1 ; 1 ; save y2 | |
179 paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 | |
180 | |
181 movq mm1, [x3] ; 1 ; x3 | |
182 psllw mm3, SHIFT_FRW_COL+1 ; t5 (shifted one bit more than t0..t3) | |
183 | |
184 psubsw mm1, [x4] ; t4 = x[3] - x[4] | |
185 movq mm6, mm2 ; 6 ; t6 | |
186 | |
187 movq [y4], mm4 ; 4 ; save y4 | |
188 paddsw mm2, mm3 ; t6 + t5 | |
189 | |
190 pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16 (multiplier read from the ocos_4_16 entry) | |
191 psubsw mm6, mm3 ; 3 ; t6 - t5 | |
192 | |
193 pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16 (multiplier read from the ocos_4_16 entry) | |
194 psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 | |
195 | |
196 por mm5, [fdct_one_corr] ; correction y6 +0.5 | |
197 psllw mm1, SHIFT_FRW_COL ; t4 | |
198 | |
199 por mm2, [fdct_one_corr] ; correction tp65 +0.5 | |
200 movq mm4, mm1 ; 4 ; t4 | |
201 | |
202 movq mm3, [x0] ; 3 ; x0 | |
203 paddsw mm1, mm6 ; tp465 = t4 + tm65 | |
204 | |
205 psubsw mm3, [x7] ; t7 = x[0] - x[7] | |
206 psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 | |
207 | |
208 movq mm0, [tg_1_16] ; 0 ; tg_1_16 | |
209 psllw mm3, SHIFT_FRW_COL ; t7 | |
210 | |
211 movq mm6, [tg_3_16] ; 6 ; tg_3_16 | |
212 pmulhw mm0, mm1 ; tp465*tg_1_16 | |
213 | |
214 movq [y0], mm7 ; 7 ; save y0 | |
215 pmulhw mm6, mm4 ; tm465*tg_3_16 | |
216 | |
217 movq [y6], mm5 ; 5 ; save y6 | |
218 movq mm7, mm3 ; 7 ; t7 | |
219 | |
220 movq mm5, [tg_3_16] ; 5 ; tg_3_16 | |
221 psubsw mm7, mm2 ; tm765 = t7 - tp65 | |
222 | |
223 paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 | |
224 pmulhw mm5, mm7 ; tm765*tg_3_16 | |
225 | |
226 paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 | |
227 paddsw mm6, mm4 ; tm465*tg_3_16 (pmulhw product with tm465 added back) | |
228 | |
229 pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 | |
230 ;; | |
231 | |
232 por mm0, [fdct_one_corr] ; correction y1 +0.5 | |
233 paddsw mm5, mm7 ; tm765*tg_3_16 (pmulhw product with tm765 added back) | |
234 | |
235 psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 | |
236 add INP, 0x08 ; ; advance INP by 8 bytes so x0..x7 address columns 4-7 (OUT is left unchanged) | |
237 | |
238 movq [y1], mm0 ; 0 ; save y1 | |
239 paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 | |
240 | |
241 movq [y3], mm7 ; 7 ; save y3 | |
242 psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 (still in mm3; stored just after the next label) | |
243 | |
244 movq [y5], mm5 ; 5 ; save y5 | |
245 | |
246 | |
247 .mmx32_fdct_col47: ; begin processing last four columns (INP already advanced by 8 bytes; stores use OUT+8) | |
248 movq mm0, [x1] ; 0 ; x1 | |
249 ;; | |
250 movq [y7], mm3 ; 3 ; save y7 (columns 0-3), left in mm3 by the code before this label | |
251 ;; | |
252 | |
253 movq mm1, [x6] ; 1 ; x6 | |
254 movq mm2, mm0 ; 2 ; x1 | |
255 | |
256 movq mm3, [x2] ; 3 ; x2 | |
257 paddsw mm0, mm1 ; t1 = x[1] + x[6] | |
258 | |
259 movq mm4, [x5] ; 4 ; x5 | |
260 psllw mm0, SHIFT_FRW_COL ; t1 | |
261 | |
262 movq mm5, [x0] ; 5 ; x0 | |
263 paddsw mm4, mm3 ; t2 = x[2] + x[5] | |
264 | |
265 paddsw mm5, [x7] ; t0 = x[0] + x[7] | |
266 psllw mm4, SHIFT_FRW_COL ; t2 | |
267 | |
268 movq mm6, mm0 ; 6 ; t1 | |
269 psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6] | |
270 | |
271 movq mm1, [tg_2_16] ; 1 ; tg_2_16 | |
272 psubsw mm0, mm4 ; tm12 = t1 - t2 | |
273 | |
274 movq mm7, [x3] ; 7 ; x3 | |
275 pmulhw mm1, mm0 ; tm12*tg_2_16 | |
276 | |
277 paddsw mm7, [x4] ; t3 = x[3] + x[4] | |
278 psllw mm5, SHIFT_FRW_COL ; t0 | |
279 | |
280 paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2 | |
281 psllw mm7, SHIFT_FRW_COL ; t3 | |
282 | |
283 movq mm4, mm5 ; 4 ; t0 | |
284 psubsw mm5, mm7 ; tm03 = t0 - t3 | |
285 | |
286 paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16 | |
287 paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3 | |
288 | |
289 por mm1, [fdct_one_corr] ; correction y2 +0.5 | |
290 psllw mm2, SHIFT_FRW_COL+1 ; t6 (shifted one bit more than t0..t3) | |
291 | |
292 pmulhw mm5, [tg_2_16] ; tm03*tg_2_16 | |
293 movq mm7, mm4 ; 7 ; tp03 | |
294 | |
295 psubsw mm3, [x5] ; t5 = x[2] - x[5] | |
296 psubsw mm4, mm6 ; y4 = tp03 - tp12 | |
297 | |
298 movq [y2+8], mm1 ; 1 ; save y2 | |
299 paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12 | |
300 | |
301 movq mm1, [x3] ; 1 ; x3 | |
302 psllw mm3, SHIFT_FRW_COL+1 ; t5 (shifted one bit more than t0..t3) | |
303 | |
304 psubsw mm1, [x4] ; t4 = x[3] - x[4] | |
305 movq mm6, mm2 ; 6 ; t6 | |
306 | |
307 movq [y4+8], mm4 ; 4 ; save y4 | |
308 paddsw mm2, mm3 ; t6 + t5 | |
309 | |
310 pmulhw mm2, [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16 (multiplier read from the ocos_4_16 entry) | |
311 psubsw mm6, mm3 ; 3 ; t6 - t5 | |
312 | |
313 pmulhw mm6, [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16 (multiplier read from the ocos_4_16 entry) | |
314 psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12 | |
315 | |
316 por mm5, [fdct_one_corr] ; correction y6 +0.5 | |
317 psllw mm1, SHIFT_FRW_COL ; t4 | |
318 | |
319 por mm2, [fdct_one_corr] ; correction tp65 +0.5 | |
320 movq mm4, mm1 ; 4 ; t4 | |
321 | |
322 movq mm3, [x0] ; 3 ; x0 | |
323 paddsw mm1, mm6 ; tp465 = t4 + tm65 | |
324 | |
325 psubsw mm3, [x7] ; t7 = x[0] - x[7] | |
326 psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65 | |
327 | |
328 movq mm0, [tg_1_16] ; 0 ; tg_1_16 | |
329 psllw mm3, SHIFT_FRW_COL ; t7 | |
330 | |
331 movq mm6, [tg_3_16] ; 6 ; tg_3_16 | |
332 pmulhw mm0, mm1 ; tp465*tg_1_16 | |
333 | |
334 movq [y0+8], mm7 ; 7 ; save y0 | |
335 pmulhw mm6, mm4 ; tm465*tg_3_16 | |
336 | |
337 movq [y6+8], mm5 ; 5 ; save y6 | |
338 movq mm7, mm3 ; 7 ; t7 | |
339 | |
340 movq mm5, [tg_3_16] ; 5 ; tg_3_16 | |
341 psubsw mm7, mm2 ; tm765 = t7 - tp65 | |
342 | |
343 paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65 | |
344 pmulhw mm5, mm7 ; tm765*tg_3_16 | |
345 | |
346 paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16 | |
347 paddsw mm6, mm4 ; tm465*tg_3_16 (pmulhw product with tm465 added back) | |
348 | |
349 pmulhw mm3, [tg_1_16] ; tp765*tg_1_16 | |
350 ;; | |
351 | |
352 por mm0, [fdct_one_corr] ; correction y1 +0.5 | |
353 paddsw mm5, mm7 ; tm765*tg_3_16 (pmulhw product with tm765 added back) | |
354 | |
355 psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16 | |
356 ;; | |
357 | |
358 movq [y1+8], mm0 ; 0 ; save y1 | |
359 paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465 | |
360 | |
361 movq [y3+8], mm7 ; 7 ; save y3 | |
362 psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465 | |
363 | |
364 movq [y5+8], mm5 ; 5 ; save y5 | |
365 | |
366 movq [y7+8], mm3 ; 3 ; save y7 | |
367 | |
368 ; emms; | |
369 ; } ; end of forward_dct_col07() | |
370 ; done with dct_col transform (both column halves) | |
371 | |
372 | |
373 ; fdct_mmx32_cols() -- | |
374 ; the following subroutine repeats the row-transform operation, | |
375 ; except with different shift&round constants. This version | |
376 ; does NOT transpose the output again. Thus the final output | |
377 ; is transposed with respect to the source. | |
378 ; | |
379 ; The output is stored into blk[], which destroys the original | |
380 ; input data. | |
381 mov INP, [ebp+8]; ;; row 0 (reload blk: INP was advanced by 8 during the column pass) | |
382 mov edi, 0x08; ;x = 8 (row-loop counter) | |
383 | |
384 lea TABLE, [tab_frw_01234567]; ; row 0 (TABLE reuses ebx, replacing the TABLEF base) | |
385 mov OUT, INP; row results are written back over the same row of blk[] | |
386 | |
387 lea round_frw_row, [fdct_r_row]; address of the rounder added before each psrad in the loop | |
388 ; for ( x = 8; x > 0; --x ) ; transform one row per iteration | |
389 | |
390 ; ---------- loop begin | |
391 .lp_mmx_fdct_row1: ; one iteration transforms one 8-coefficient row at INP and stores it at the same row via OUT | |
392 movd mm5, [INP+12]; ; mm5 = 7 6 | |
393 | |
394 punpcklwd mm5, [INP+8] ; mm5 = 5 7 4 6 | |
395 | |
396 movq mm2, mm5; ; mm2 = 5 7 4 6 | |
397 psrlq mm5, 32; ; mm5 = _ _ 5 7 | |
398 | |
399 movq mm0, [INP]; ; mm0 = 3 2 1 0 | |
400 punpcklwd mm5, mm2;; mm5 = 4 5 6 7 | |
401 | |
402 movq mm1, mm0; ; mm1 = 3 2 1 0 | |
403 paddsw mm0, mm5; ; mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0) | |
404 | |
405 psubsw mm1, mm5; ; mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4) | |
406 movq mm2, mm0; ; mm2 = [ xt3 xt2 xt1 xt0 ] | |
407 | |
408 ;movq [ xt3xt2xt1xt0 ], mm0; | |
409 ;movq [ xt7xt6xt5xt4 ], mm1; | |
410 | |
411 punpcklwd mm0, mm1;; mm0 = [ xt5 xt1 xt4 xt0 ] | |
412 | |
413 punpckhwd mm2, mm1;; mm2 = [ xt7 xt3 xt6 xt2 ] | |
414 movq mm1, mm2; ; mm1 = [ xt7 xt3 xt6 xt2 ] | |
415 | |
416 ;; shuffle words around; below, the "x" names are the word positions of mm0/mm1 as left by the shuffle above, the bracketed "xt" names are the actual contents | |
417 | |
418 ; movq mm0, [INP] ; 0 ; x3 x2 x1 x0 | |
419 | |
420 ; movq mm1, [INP+8] ; 1 ; x7 x6 x5 x4 | |
421 movq mm2, mm0 ; 2 ; x3 x2 x1 x0 | |
422 | |
423 movq mm3, [TABLE] ; 3 ; w06 w04 w02 w00 | |
424 punpcklwd mm0, mm1 ; x5 x1 x4 x0 [ xt6 xt4 xt2 xt0 ] | |
425 | |
426 movq mm5, mm0 ; 5 ; x5 x1 x4 x0 | |
427 punpckldq mm0, mm0 ; x4 x0 x4 x0 [ xt2 xt0 xt2 xt0 ] | |
428 | |
429 movq mm4, [TABLE+8] ; 4 ; w07 w05 w03 w01 | |
430 punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 [ xt7 xt5 xt3 xt1 ] | |
431 | |
432 pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 | |
433 movq mm6, mm2 ; 6 ; x7 x3 x6 x2 | |
434 | |
435 movq mm1, [TABLE+32] ; 1 ; w22 w20 w18 w16 | |
436 punpckldq mm2, mm2 ; x6 x2 x6 x2 [ xt3 xt1 xt3 xt1 ] | |
437 | |
438 pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 | |
439 punpckhdq mm5, mm5 ; x5 x1 x5 x1 [ xt6 xt4 xt6 xt4 ] | |
440 | |
441 pmaddwd mm0, [TABLE+16] ; x4*w14+x0*w12 x4*w10+x0*w08 | |
442 punpckhdq mm6, mm6 ; x7 x3 x7 x3 [ xt7 xt5 xt7 xt5 ] | |
443 | |
444 movq mm7, [TABLE+40] ; 7 ; w23 w21 w19 w17 | |
445 pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 | |
446 ;mm3 = a1, a0 (y2,y0) | |
447 ;mm1 = b1, b0 (y3,y1) | |
448 ;mm0 = a3,a2 (y6,y4) | |
449 ;mm5 = b3,b2 (y7,y5) | |
450 | |
451 paddd mm3, [round_frw_row] ; +rounder (y2,y0) | |
452 pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 | |
453 | |
454 pmaddwd mm2, [TABLE+24] ; x6*w15+x2*w13 x6*w11+x2*w09 | |
455 paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) ; now ( y2, y0) | |
456 | |
457 pmaddwd mm5, [TABLE+48] ; x5*w30+x1*w28 x5*w26+x1*w24 | |
458 ;; | |
459 | |
460 pmaddwd mm6, [TABLE+56] ; x7*w31+x3*w29 x7*w27+x3*w25 | |
461 paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) ; now ( y3, y1) | |
462 | |
463 paddd mm0, [round_frw_row] ; +rounder (y6,y4) | |
464 psrad mm3, SHIFT_FRW_ROW ; (y2, y0) | |
465 | |
466 paddd mm1, [round_frw_row] ; +rounder (y3,y1) | |
467 paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2) ; now (y6, y4) | |
468 | |
469 paddd mm5, [round_frw_row] ; +rounder (y7,y5) | |
470 psrad mm1, SHIFT_FRW_ROW ; (y3, y1) | |
471 | |
472 paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2) ; now ( y7, y5) | |
473 psrad mm0, SHIFT_FRW_ROW ; (y6, y4) | |
474 | |
475 add OUT, 16; ; increment row-output address by 1 row | |
476 psrad mm5, SHIFT_FRW_ROW ; (y7, y5) | |
477 | |
478 add INP, 16; ; increment row-address by 1 row | |
479 packssdw mm3, mm0 ; 0 ; y6 y4 y2 y0 | |
480 | |
481 packssdw mm1, mm5 ; 3 ; y7 y5 y3 y1 | |
482 movq mm6, mm3; ; mm6 = y6 y4 y2 y0 | |
483 | |
484 punpcklwd mm3, mm1; ; y3 y2 y1 y0 | |
485 sub edi, 0x01; ; x = x - 1 (rows remaining) | |
486 | |
487 punpckhwd mm6, mm1; ; y7 y6 y5 y4 | |
488 add TABLE,64; ; increment to next table (64 bytes of coefficients per row) | |
489 | |
490 movq [OUT-16], mm3 ; 1 ; save y3 y2 y1 y0 (OUT was already advanced, hence the -16) | |
491 | |
492 movq [OUT-8], mm6 ; 7 ; save y7 y6 y5 y4 | |
493 | |
494 cmp edi, 0x00; explicit re-test: the add TABLE above overwrote the flags set by the sub | |
495 jg near .lp_mmx_fdct_row1; ; begin fdct processing on next row | |
496 ;; | |
497 ;; Tidy up and return | |
498 ;; | |
499 pop edi ; restore registers in reverse order of the pushes at entry | |
500 pop edx | |
501 pop ecx | |
502 pop ebx | |
503 | |
504 pop ebp ; restore caller's frame pointer | |
505 emms ; clear MMX state so x87 floating-point code can run after return | |
506 ret | |
507 |