annotate i386/vp3dsp_sse2.c @ 2892:41315d0120b3 libavcodec

replace a few mov + psrlq with pshufw, there are more cases which could benefit from this but they would require us to duplicate some functions ... the trick is from various places (my own code in libpostproc, a patch on the x264 list, ...)
author michael
date Wed, 21 Sep 2005 21:17:09 +0000
parents fd5d7c732c6b
children ef2149182f1c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
1 /*
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
2 * Copyright (C) 2004 the ffmpeg project
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
3 *
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
4 * This library is free software; you can redistribute it and/or
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
5 * modify it under the terms of the GNU Lesser General Public
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
6 * License as published by the Free Software Foundation; either
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
7 * version 2 of the License, or (at your option) any later version.
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
8 *
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
9 * This library is distributed in the hope that it will be useful,
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
12 * Lesser General Public License for more details.
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
13 *
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
14 * You should have received a copy of the GNU Lesser General Public
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
15 * License along with this library; if not, write to the Free Software
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
17 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
18
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
19 /**
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
20 * @file vp3dsp_sse2.c
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
21 * SSE2-optimized functions cribbed from the original VP3 source code.
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
22 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
23
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
24 #include "../dsputil.h"
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
25 #include "mmx.h"
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
26
2753
ba8ecddf5598 adding a few const
michael
parents: 2696
diff changeset
/*
 * Seven rows of eight 16-bit words each. 65535 (0xFFFF) marks a selected
 * word, 0 an unselected one. The hex comment at the end of each row lists
 * the same eight words in reverse order (last initializer first).
 * NOTE(review): the name suggests word masks used during dequantization;
 * no use of this table is visible in this part of the file - confirm.
 */
27 static const unsigned short __align16 SSE2_dequant_const[] =
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
28 {
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
29 0,65535,65535,0,0,0,0,0, // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
30 0,0,0,0,65535,65535,0,0, // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
31 65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
32 0,0,0,65535,0,0,0,0, // 0x0000 0000 0000 0000 FFFF 0000 0000 0000
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
33 0,0,0,65535,65535,0,0,0, // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
34 65535,0,0,0,0,65535,0,0, // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
35 0,0,65535,65535, 0,0,0,0 // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
36 };
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
37
2753
ba8ecddf5598 adding a few const
michael
parents: 2696
diff changeset
/*
 * Four 32-bit words of 0x00080008: the 16-bit value 8 in each of eight
 * word lanes, 128 bits in total.
 * NOTE(review): presumably the operand behind `Eight` in the IDCT macros,
 * where it is added just before an arithmetic right shift by 4 (8 is half
 * of 16, so that would round to nearest). The definition of Eight is
 * outside this view - confirm.
 */
38 static const unsigned int __align16 eight_data[] =
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
39 {
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
40 0x00080008,
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
41 0x00080008,
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
42 0x00080008,
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
43 0x00080008
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
44 };
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
45
2753
ba8ecddf5598 adding a few const
michael
parents: 2696
diff changeset
/*
 * Seven rows of eight identical 16-bit words. Row k-1 (k = 1..7) holds
 * round(65536 * cos(k * pi / 16)), i.e. the cosine as a 16-bit fraction,
 * repeated in every word lane.
 * NOTE(review): presumably the table that C(1)..C(7) in the IDCT macros
 * index; C() is defined outside this view - confirm. If so, the rows
 * above 32767 (k = 1..5) read as (value - 65536) when treated as signed
 * words, which would match the "c * i - i" comments on the pmulhw lines
 * that use c1..c5 and the plain "c * i" comments for c6 and c7.
 */
46 static const unsigned short __align16 SSE2_idct_data[7 * 8] =
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
47 {
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
48 64277,64277,64277,64277,64277,64277,64277,64277, /* k = 1 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
49 60547,60547,60547,60547,60547,60547,60547,60547, /* k = 2 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
50 54491,54491,54491,54491,54491,54491,54491,54491, /* k = 3 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
51 46341,46341,46341,46341,46341,46341,46341,46341, /* k = 4 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
52 36410,36410,36410,36410,36410,36410,36410,36410, /* k = 5 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
53 25080,25080,25080,25080,25080,25080,25080,25080, /* k = 6 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
54 12785,12785,12785,12785,12785,12785,12785,12785 /* k = 7 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
55 };
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
56
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
57
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
58 #define SSE2_Column_IDCT() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
59 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
60 movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
61 movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
62 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
63 movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
64 movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
65 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
66 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
67 movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
68 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
69 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
70 movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
71 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
72 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
73 movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
74 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
75 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
76 movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
77 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
78 /* all registers are in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
79 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
80 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
81 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
82 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
83 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
84 movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
85 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
86 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
87 movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
88 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
89 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
90 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
91 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
92 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
93 movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
94 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
95 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
96 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
97 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
98 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
99 movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
100 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
101 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
102 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
103 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
104 movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
105 pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
106 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
107 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
108 movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
109 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
110 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
111 movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
112 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
113 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
114 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
115 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
116 paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
117 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
118 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
119 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
120 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
121 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
122 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
123 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
124 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
125 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
126 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
127 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
128 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
129 movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
130 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
131 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
132 movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
133 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
134 movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
135 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
136 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
137 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
138 movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
139 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
140 movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
141 movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
142 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
143 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
144 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
145 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
146 movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
147 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
148 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
149 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
150 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
151 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
152 movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
153 pmulhw_r2r(xmm4, xmm6); /* xmm6 = (c4 - 1) * (i0 - i4) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
154 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
155 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
156 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
157 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
158 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
159 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
160 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
161 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
162 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) = F */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
163 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
164 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
165 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
166 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
167 movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
168 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
169 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
170 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
171 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
172 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
173 paddsw_m2r(*Eight, xmm2); /* Adjust R2 and R1 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
174 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
175 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
176 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
177 psraw_i2r(4, xmm2); /* xmm2 = op2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
178 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
179 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
180 psraw_i2r(4, xmm1); /* xmm1 = op1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
181 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
182 movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
183 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
184 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
185 movdqu_r2m(xmm2, *O(2)); /* Write out op2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
186 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
187 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
188 movdqu_r2m(xmm1, *O(1)); /* Write out op1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
189 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
190 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
191 paddsw_m2r(*Eight, xmm4); /* Adjust R4 and R3 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
192 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
193 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
194 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
195 psraw_i2r(4, xmm4); /* xmm4 = op4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
196 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
197 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
198 psraw_i2r(4, xmm3); /* xmm3 = op3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
199 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
200 paddsw_m2r(*Eight, xmm6); /* Adjust R6 and R5 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
201 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
202 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
203 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
204 psraw_i2r(4, xmm6); /* xmm6 = op6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
205 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
206 movdqu_r2m(xmm4, *O(4)); /* Write out op4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
207 psraw_i2r(4, xmm5); /* xmm5 = op5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
208 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
209 movdqu_r2m(xmm3, *O(3)); /* Write out op3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
210 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
211 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
212 paddsw_m2r(*Eight, xmm7); /* Adjust R7 and R0 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
213 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
214 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
215 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. = R0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
216 psraw_i2r(4, xmm7); /* xmm7 = op7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
217 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
218 movdqu_r2m(xmm6, *O(6)); /* Write out op6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
219 psraw_i2r(4, xmm0); /* xmm0 = op0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
220 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
221 movdqu_r2m(xmm5, *O(5)); /* Write out op5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
222 movdqu_r2m(xmm7, *O(7)); /* Write out op7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
223 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
224 movdqu_r2m(xmm0, *O(0)); /* Write out op0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
225 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
226 } /* End of SSE2_Column_IDCT macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
227
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
228
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
229 #define SSE2_Row_IDCT() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
230 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
231 movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
232 movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
233 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
234 movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
235 movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
236 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
237 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
238 movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
239 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
240 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
241 movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
242 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
243 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
244 movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
245 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
246 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
247 movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
248 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
249 /* all registers are in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
250 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
251 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
252 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
253 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
254 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
255 movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
256 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
257 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
258 movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
259 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
260 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
261 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
262 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
263 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
264 movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
265 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
266 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
267 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
268 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
269 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
270 movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
271 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
272 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
273 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
274 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
275 movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
276 pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
277 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
278 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
279 movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
280 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
281 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
282 movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
283 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
284 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
285 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
286 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
287 paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
288 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
289 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
290 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
291 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
292 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
293 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
294 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
295 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
296 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
297 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
298 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
299 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
300 movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
301 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
302 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
303 movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
304 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
305 movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
306 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
307 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
308 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
309 movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
310 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
311 movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
312 movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
313 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
314 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
315 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
316 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
317 movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
318 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
319 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
320 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
321 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
322 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
323 movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
324 pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
325 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
326 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
327 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
328 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
329 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
330 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
331 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
332 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
333 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
334 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
335 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
336 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
337 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
338 movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
339 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
340 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
341 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
342 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
343 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
344 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
345 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
346 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
347 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
348 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
349 movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
350 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
351 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
352 movdqu_r2m(xmm2, *I(2)); /* Write out op2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
353 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
354 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
355 movdqu_r2m(xmm1, *I(1)); /* Write out op1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
356 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
357 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
358 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
359 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
360 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
361 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
362 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
363 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
364 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
365 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
366 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
367 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
368 movdqu_r2m(xmm4, *I(4)); /* Write out op4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
369 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
370 movdqu_r2m(xmm3, *I(3)); /* Write out op3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
371 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
372 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
373 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
374 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
375 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
376 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
377 movdqu_r2m(xmm6, *I(6)); /* Write out op6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
378 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
379 movdqu_r2m(xmm5, *I(5)); /* Write out op5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
380 movdqu_r2m(xmm7, *I(7)); /* Write out op7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
381 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
382 movdqu_r2m(xmm0, *I(0)); /* Write out op0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
383 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
384 } /* End of SSE2_Row_IDCT macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
385
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
386
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
387 #define SSE2_Transpose() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
388 /* Transposes, in place, the 8x8 block of 16-bit words held in rows I(0)..I(7); I(6) is also used as scratch for one intermediate. Register comments list words from highest to lowest; letters a..h denote the values loaded from I(0)..I(7), digits the word index within that row. I() is not defined in this macro -- presumably supplied by the code that expands it. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
389 movdqu_m2r(*I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
390 movdqu_m2r(*I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
391 /* interleave the words of rows e and f */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
392 movdqu_r2r(xmm4, xmm5); /* xmm5 = copy of row e, for the high half */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
393 punpcklwd_r2r(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
394 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
395 punpckhwd_r2r(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
396 movdqu_m2r(*I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
397 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
398 movdqu_m2r(*I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
399 movdqu_r2r(xmm6, xmm7); /* xmm7 = copy of row g, for the high half */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
400 /* interleave the words of rows g and h */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
401 punpcklwd_r2r(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
402 punpckhwd_r2r(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
403 /* interleave dwords: build the e..h halves of the output rows */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
404 movdqu_r2r(xmm4, xmm3); /* xmm3 = copy of f3e3f2e2f1e1f0e0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
405 punpckldq_r2r(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
406 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
407 punpckhdq_r2r(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
408 movdqu_r2m(xmm3, *I(6)); /* spill h3g3f3e3h2g2f2e2 to I(6); row g was already loaded above, and this value is reloaded below */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
409 /* Free xmm6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
410 movdqu_r2r(xmm5, xmm6); /* xmm6 = copy of f7e7f6e6f5e5f4e4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
411 punpckldq_r2r(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
412 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
413 punpckhdq_r2r(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
414 movdqu_m2r(*I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
415 /* Free xmm7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
416 movdqu_m2r(*I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
417 movdqu_r2r(xmm0, xmm7); /* xmm7 = copy of row a, for the high half */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
418 /* interleave the words of rows a and b */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
419 punpcklwd_r2r(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
420 punpckhwd_r2r(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
421 /* Free xmm1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
422 movdqu_m2r(*I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
423 movdqu_m2r(*I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
424 /* interleave the words of rows c and d */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
425 movdqu_r2r(xmm2, xmm1); /* xmm1 = copy of row c, for the high half */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
426 punpcklwd_r2r(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
427 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
428 punpckhwd_r2r(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
429 movdqu_r2r(xmm0, xmm3); /* xmm3 = copy of b3a3b2a2b1a1b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
430 /* interleave dwords: build the a..d halves of the output rows */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
431 punpckldq_r2r(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
432 punpckhdq_r2r(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
433 /* Free xmm2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
434 movdqu_r2r(xmm7, xmm2); /* xmm2 = copy of b7a7b6a6b5a5b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
435 punpckldq_r2r(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
436 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
437 punpckhdq_r2r(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
438 movdqu_r2r(xmm0, xmm1); /* xmm1 = copy of d1c1b1a1d0c0b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
439 /* interleave qwords: join the a..d and e..h halves into full output rows */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
440 punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
441 punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
442 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
443 movdqu_r2m(xmm0, *I(0)); /* save I(0) = h0g0f0e0d0c0b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
444 movdqu_r2m(xmm1, *I(1)); /* save I(1) = h1g1f1e1d1c1b1a1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
445 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
446 movdqu_m2r(*I(6), xmm0); /* reload h3g3f3e3h2g2f2e2 spilled to I(6) earlier */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
447 movdqu_r2r(xmm3, xmm1); /* xmm1 = copy of d3c3b3a3d2c2b2a2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
448 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
449 punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
450 punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
451 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
452 movdqu_r2r(xmm2, xmm4); /* xmm4 = copy of d5c5b5a5d4c4b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
453 punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
454 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
455 punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
456 movdqu_r2m(xmm1, *I(2)); /* save I(2) = h2g2f2e2d2c2b2a2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
457 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
458 movdqu_r2m(xmm3, *I(3)); /* save I(3) = h3g3f3e3d3c3b3a3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
459 movdqu_r2m(xmm4, *I(4)); /* save I(4) = h4g4f4e4d4c4b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
460 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
461 movdqu_r2m(xmm2, *I(5)); /* save I(5) = h5g5f5e5d5c5b5a5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
462 movdqu_r2r(xmm7, xmm5); /* xmm5 = copy of d7c7b7a7d6c6b6a6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
463 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
464 punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
465 punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
466 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
467 movdqu_r2m(xmm5, *I(6)); /* save I(6) = h6g6f6e6d6c6b6a6 (overwrites the scratch value) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
468 movdqu_r2m(xmm7, *I(7)); /* save I(7) = h7g7f7e7d7c7b7a7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
469 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
470 } /* End of Transpose Macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
471
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
472
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
473 #define SSE2_Dequantize() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
474 movdqu_m2r(*(eax), xmm0); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
475 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
476 pmullw_m2r(*(ebx), xmm0); /* xmm0 = 07 06 05 04 03 02 01 00 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
477 movdqu_m2r(*(eax + 16), xmm1); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
478 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
479 pmullw_m2r(*(ebx + 16), xmm1); /* xmm1 = 17 16 15 14 13 12 11 10 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
480 pshuflw_r2r(xmm0, xmm3, 0x078); /* xmm3 = 07 06 05 04 01 03 02 00 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
481 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
482 movdqu_r2r(xmm1, xmm2); /* xmm2 = 17 16 15 14 13 12 11 10 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
483 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
484 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
485 movdqu_m2r(*(eax + 32), xmm4); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
486 movdqu_m2r(*(eax + 64), xmm5); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
487 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
488 pmullw_m2r(*(ebx + 32), xmm4); /* xmm4 = 27 26 25 24 23 22 21 20 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
489 pmullw_m2r(*(ebx + 64), xmm5); /* xmm5 = 47 46 45 44 43 42 41 40 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
490 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
491 movdqu_m2r(*(ecx + 16), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
492 pand_r2r(xmm2, xmm7); /* xmm7 = -- -- -- -- -- 12 11 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
493 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
494 pand_r2r(xmm4, xmm6); /* xmm6 = -- -- 25 24 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
495 pxor_r2r(xmm7, xmm2); /* xmm2 = 17 16 15 14 13 -- -- 10 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
496 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
497 pxor_r2r(xmm6, xmm4); /* xmm4 = 27 26 -- -- 23 22 21 20 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
498 pslldq_i2r(4, xmm7); /* xmm7 = -- -- -- 12 11 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
499 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
500 pslldq_i2r(2, xmm6); /* xmm6 = -- 25 24 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
501 por_r2r(xmm6, xmm7); /* xmm7 = -- 25 24 12 11 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
502 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
503 movdqu_m2r(*(ecx + 32), xmm0); /* xmm0 = -- -- -- -- -- FF FF FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
504 movdqu_m2r(*(ecx + 48), xmm6); /* xmm6 = -- -- -- -- FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
505 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
506 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- -- -- 03 02 00 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
507 pand_r2r(xmm5, xmm6); /* xmm6 = -- -- -- -- 43 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
508 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
509 pxor_r2r(xmm0, xmm3); /* xmm3 = 07 06 05 04 01 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
510 pxor_r2r(xmm6, xmm5); /* xmm5 = 47 46 45 44 -- 42 41 40 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
511 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
512 por_r2r(xmm7, xmm0); /* xmm0 = -- 25 24 12 11 03 02 00 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
513 pslldq_i2r(8, xmm6); /* xmm6 = 43 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
514 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
515 por_r2r(xmm6, xmm0); /* xmm0 = 43 25 24 12 11 03 02 00 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
516 /* 02345 in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
517 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
518 movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
519 pshuflw_r2r(xmm5, xmm5, 0x0B4); /* xmm5 = 47 46 45 44 42 -- 41 40 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
520 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
521 movdqu_r2r(xmm1, xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
522 movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
523 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
524 movdqu_r2m(xmm0, *(eax)); /* write 43 25 24 12 11 03 02 00 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
525 pshufhw_r2r(xmm4, xmm4, 0x0C2); /* xmm4 = 27 -- -- 26 23 22 21 20 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
526 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
527 pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- 26 23 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
528 pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 44 42 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
529 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
530 pxor_r2r(xmm7, xmm4); /* xmm4 = 27 -- -- -- -- 22 21 20 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
531 pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 45 -- -- -- 41 40 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
532 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
533 pshuflw_r2r(xmm2, xmm2, 0x0C6); /* xmm2 = 17 16 15 14 13 10 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
534 movdqu_r2r(xmm6, xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
535 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
536 pslldq_i2r(2, xmm7); /* xmm7 = -- -- 26 23 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
537 pslldq_i2r(6, xmm1); /* xmm1 = 44 42 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
538 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
539 psrldq_i2r(2, xmm0); /* xmm0 = -- -- -- -- FF FF -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
540 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- 04 01 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
541 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
542 pand_r2r(xmm2, xmm0); /* xmm0 = -- -- -- -- 13 10 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
543 pxor_r2r(xmm6, xmm3); /* xmm3 = 07 06 05 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
544 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
545 pxor_r2r(xmm0, xmm2); /* xmm2 = 17 16 15 14 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
546 psrldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- -- -- 04 01 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
547 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
548 por_r2r(xmm7, xmm1); /* xmm1 = 44 42 26 23 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
549 por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- 13 10 04 01 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
550 /* 12345 in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
551 por_r2r(xmm0, xmm1); /* xmm1 = 44 42 26 23 13 10 04 01 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
552 pshuflw_r2r(xmm4, xmm4, 0x093); /* xmm4 = 27 -- -- -- 22 21 20 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
553 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
554 pshufhw_r2r(xmm4, xmm4, 0x093); /* xmm4 = -- -- -- 27 22 21 20 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
555 movdqu_r2m(xmm1, *(eax + 16)); /* write 44 42 26 23 13 10 04 01 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
556 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
557 pshufhw_r2r(xmm3, xmm3, 0x0D2); /* xmm3 = 07 05 -- 06 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
558 movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
559 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
560 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 06 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
561 psrldq_i2r(12, xmm3); /* xmm3 = -- -- -- -- -- -- 07 05 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
562 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
563 psrldq_i2r(8, xmm0); /* xmm0 = -- -- -- -- -- -- -- 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
564 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
565 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
566 movdqu_m2r(*(ecx + 96), xmm7); /* xmm7 = -- -- -- -- FF FF -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
567 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
568 pand_r2r(xmm4, xmm6); /* xmm6 = -- -- -- 27 22 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
569 pxor_r2r(xmm6, xmm4); /* xmm4 = -- -- -- -- -- 21 20 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
570 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
571 por_r2r(xmm6, xmm3); /* xmm3 = -- -- -- 27 22 -- 07 05 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
572 pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- -- -- 21 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
573 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
574 por_r2r(xmm7, xmm0); /* xmm0 = -- -- -- -- -- 21 -- 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
575 pxor_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- -- 20 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
576 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
577 movdqu_m2r(*(ecx + 16 ), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
578 movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
579 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
580 pand_r2r(xmm2, xmm6); /* xmm6 = -- -- 15 14 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
581 pand_r2r(xmm6, xmm1); /* xmm1 = -- -- -- 14 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
582 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
583 pxor_r2r(xmm6, xmm2); /* xmm2 = 17 16 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
584 pxor_r2r(xmm1, xmm6); /* xmm6 = -- -- 15 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
585 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
586 psrldq_i2r(4, xmm1); /* xmm1 = -- -- -- -- -- 14 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
587 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
588 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- 15 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
589 por_r2r(xmm1, xmm3); /* xmm3 = -- -- -- 27 22 14 07 05 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
590 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
591 por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- -- 21 15 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
592 pshufhw_r2r(xmm5, xmm5, 0x0E1); /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
593 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
594 movdqu_m2r(*(ecx + 64), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
595 pshuflw_r2r(xmm5, xmm5, 0x072); /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
596 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
597 movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
598 pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 45 41 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
599 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
600 pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 -- -- -- -- 40 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
601 pslldq_i2r(4, xmm1); /* xmm1 = -- 45 41 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
602 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
603 pshufd_r2r(xmm5, xmm5, 0x09C); /* xmm5 = -- -- -- -- 47 46 40 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
604 por_r2r(xmm1, xmm3); /* xmm3 = -- 45 41 27 22 14 07 05 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
605 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
606 movdqu_m2r(*(eax + 96), xmm1); /* xmm1 = 67 66 65 64 63 62 61 60 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
607 pmullw_m2r(*(ebx + 96), xmm1); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
608 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
609 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
610 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
611 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
612 pand_r2r(xmm5, xmm7); /* xmm7 = -- -- -- -- -- 46 40 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
613 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
614 pand_r2r(xmm1, xmm6); /* xmm6 = -- -- -- -- -- -- -- 60 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
615 pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- -- -- 47 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
616 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
617 pxor_r2r(xmm6, xmm1); /* xmm1 = 67 66 65 64 63 62 61 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
618 pslldq_i2r(2, xmm5); /* xmm5 = -- -- -- 47 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
619 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
620 pslldq_i2r(14, xmm6); /* xmm6 = 60 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
621 por_r2r(xmm5, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
622 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
623 por_r2r(xmm6, xmm3); /* xmm3 = 60 45 41 27 22 14 07 05 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
624 pslldq_i2r(6, xmm7); /* xmm7 = -- -- 46 40 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
625 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
626 movdqu_r2m(xmm3, *(eax+32)); /* write 60 45 41 27 22 14 07 05 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
627 por_r2r(xmm7, xmm0); /* xmm0 = -- -- 46 40 -- 21 15 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
628 /* 0, 1, 2, 4 in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
629 movdqu_m2r(*(eax + 48), xmm3); /* xmm3 = 37 36 35 34 33 32 31 30 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
630 movdqu_m2r(*(eax + 80), xmm5); /* xmm5 = 57 56 55 54 53 52 51 50 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
631 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
632 pmullw_m2r(*(ebx + 48), xmm3); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
633 pmullw_m2r(*(ebx + 80), xmm5); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
634 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
635 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
636 movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
637 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
638 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
639 pslldq_i2r(8, xmm7); /* xmm7 = FF -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
640 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
641 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- -- 30 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
642 pand_r2r(xmm5, xmm7); /* xmm7 = 57 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
643 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
644 pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 31 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
645 pxor_r2r(xmm7, xmm5); /* xmm5 = __ 56 55 54 53 52 51 50 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
646 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
647 pslldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- 30 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
648 psrldq_i2r(2, xmm7); /* xmm7 = -- 57 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
649 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
650 por_r2r(xmm7, xmm6); /* xmm6 = -- 57 -- -- 30 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
651 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
652 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
653 por_r2r(xmm6, xmm0); /* xmm0 = -- 57 46 40 30 21 15 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
654 psrldq_i2r(2, xmm7); /* xmm7 = -- -- -- -- -- -- FF FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
655 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
656 movdqu_r2r(xmm2, xmm6); /* xmm6 = 17 16 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
657 pand_r2r(xmm1, xmm7); /* xmm7 = -- -- -- -- -- -- 61 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
658 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
659 pslldq_i2r(2, xmm6); /* xmm6 = 16 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
660 psrldq_i2r(14, xmm2); /* xmm2 = -- -- -- -- -- -- -- 17 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
661 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
662 pxor_r2r(xmm7, xmm1); /* xmm1 = 67 66 65 64 63 62 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
663 pslldq_i2r(12, xmm7); /* xmm7 = 61 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
664 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
665 psrldq_i2r(14, xmm6); /* xmm6 = -- -- -- -- -- -- -- 16 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
666 por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 16 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
667 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
668 por_r2r(xmm7, xmm0); /* xmm0 = 61 57 46 40 30 21 15 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
669 movdqu_m2r(*(ecx), xmm6); /* xmm6 = -- -- -- -- -- FF FF -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
670 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
671 psrldq_i2r(2, xmm6); /* xmm6 = -- -- -- -- -- -- FF FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
672 movdqu_r2m(xmm0, *(eax+48)); /* write 61 57 46 40 30 21 15 06 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
673 /* 1, 2, 3, 4, 5 in use */\
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
674 movdqu_m2r(*(ecx), xmm0); /* xmm0 = -- -- -- -- -- FF FF -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
675 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- 31 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
676 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
677 movdqu_r2r(xmm3, xmm7); /* xmm7 = 37 36 35 34 33 32 31 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
678 pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
679 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
680 pslldq_i2r(2, xmm3); /* xmm3 = 36 35 34 33 32 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
681 pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- -- 62 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
682 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
683 psrldq_i2r(14, xmm7); /* xmm7 = -- -- -- -- -- -- -- 37 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
684 pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 63 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
685 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
686 por_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- 31 37 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
687 movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
688 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
689 pshuflw_r2r(xmm6, xmm6, 0x01E); /* xmm6 = -- -- -- -- 37 31 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
690 pslldq_i2r(6, xmm7); /* xmm7 = FF FF -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
691 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
692 por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 37 31 20 16 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
693 pand_r2r(xmm5, xmm7); /* xmm7 = -- 56 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
694 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
695 pslldq_i2r(8, xmm0); /* xmm0 = -- 62 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
696 pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- 55 54 53 52 51 50 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
697 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
698 psrldq_i2r(2, xmm7); /* xmm7 = -- -- 56 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
699 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
700 pshufhw_r2r(xmm3, xmm3, 0x087); /* xmm3 = 35 33 34 36 32 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
701 por_r2r(xmm7, xmm0); /* xmm0 = -- 62 56 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
702 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
703 movdqu_m2r(*(eax + 112), xmm7); /* xmm7 = 77 76 75 74 73 72 71 70 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
704 pmullw_m2r(*(ebx + 112), xmm7); \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
705 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
706 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
707 por_r2r(xmm0, xmm4); /* xmm4 = -- 62 56 47 37 31 20 16 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
708 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
709 pshuflw_r2r(xmm7, xmm7, 0x0E1); /* xmm7 = 77 76 75 74 73 72 70 71 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
710 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
711 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
712 movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
713 pand_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- -- 71 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
714 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
715 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 36 32 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
716 pxor_r2r(xmm6, xmm7); /* xmm7 = 77 76 75 74 73 72 70 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
717 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
718 pxor_r2r(xmm0, xmm3); /* xmm3 = 35 33 34 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
719 pslldq_i2r(14, xmm6); /* xmm6 = 71 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
720 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
721 psrldq_i2r(4, xmm0); /* xmm0 = -- -- -- -- -- 36 32 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
722 por_r2r(xmm6, xmm4); /* xmm4 = 71 62 56 47 37 31 20 16 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
723 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
724 por_r2r(xmm0, xmm2); /* xmm2 = -- -- -- -- -- 36 32 17 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
725 movdqu_r2m(xmm4, *(eax + 64)); /* write 71 62 56 47 37 31 20 16 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
726 /* 1, 2, 3, 5, 7 in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
727 movdqu_m2r(*(ecx + 80), xmm6); /* xmm6 = -- -- FF -- -- -- -- FF */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
728 pshufhw_r2r(xmm7, xmm7, 0x0D2); /* xmm7 = 77 75 74 76 73 72 70 __ */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
729 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
730 movdqu_m2r(*(ecx), xmm4); /* xmm4 = -- -- -- -- -- FF FF -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
731 movdqu_m2r(*(ecx+48), xmm0); /* xmm0 = -- -- -- -- FF -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
732 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
733 pand_r2r(xmm5, xmm6); /* xmm6 = -- -- 55 -- -- -- -- 50 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
734 pand_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- 72 70 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
735 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
736 pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- 63 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
737 pxor_r2r(xmm6, xmm5); /* xmm5 = -- -- -- 54 53 52 51 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
738 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
739 pxor_r2r(xmm4, xmm7); /* xmm7 = 77 75 74 76 73 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
740 pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
741 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
742 pshuflw_r2r(xmm6, xmm6, 0x02B); /* xmm6 = -- -- 55 -- 50 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
743 pslldq_i2r(10, xmm4); /* xmm4 = 72 70 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
744 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
745 pshufhw_r2r(xmm6, xmm6, 0x0B1); /* xmm6 = -- -- -- 55 50 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
746 pslldq_i2r(4, xmm0); /* xmm0 = -- -- 63 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
747 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
748 por_r2r(xmm4, xmm6); /* xmm6 = 72 70 -- 55 50 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
749 por_r2r(xmm0, xmm2); /* xmm2 = -- -- 63 -- -- 36 32 17 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
750 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
751 por_r2r(xmm6, xmm2); /* xmm2 = 72 70 63 55 50 36 32 17 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
752 pshufhw_r2r(xmm1, xmm1, 0x0C9); /* xmm1 = 67 64 66 65 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
753 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
754 movdqu_r2r(xmm3, xmm6); /* xmm6 = 35 33 34 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
755 movdqu_r2m(xmm2, *(eax+80)); /* write 72 70 63 55 50 36 32 17 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
756 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
757 psrldq_i2r(12, xmm6); /* xmm6 = -- -- -- -- -- -- 35 33 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
758 pslldq_i2r(4, xmm3); /* xmm3 = 34 -- -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
759 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
760 pshuflw_r2r(xmm5, xmm5, 0x04E); /* xmm5 = -- -- -- 54 51 -- 53 52 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
761 movdqu_r2r(xmm7, xmm4); /* xmm4 = 77 75 74 76 73 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
762 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
763 movdqu_r2r(xmm5, xmm2); /* xmm2 = -- -- -- 54 51 -- 53 52 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
764 psrldq_i2r(10, xmm7); /* xmm7 = -- -- -- -- -- 77 75 74 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
765 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
766 pslldq_i2r(6, xmm4); /* xmm4 = 76 73 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
767 pslldq_i2r(12, xmm2); /* xmm2 = 53 52 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
768 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
769 movdqu_r2r(xmm1, xmm0); /* xmm0 = 67 64 66 65 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
770 psrldq_i2r(12, xmm1); /* xmm1 = -- -- -- -- -- -- 67 64 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
771 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
772 psrldq_i2r(6, xmm5); /* xmm5 = -- -- -- -- -- -- 54 51 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
773 psrldq_i2r(14, xmm3); /* xmm3 = -- -- -- -- -- -- -- 34 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
774 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
775 pslldq_i2r(10, xmm7); /* xmm7 = 77 75 74 -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
776 por_r2r(xmm6, xmm4); /* xmm4 = 76 73 -- -- -- -- 35 33 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
777 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
778 psrldq_i2r(10, xmm2); /* xmm2 = -- -- -- -- -- 53 52 -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
779 pslldq_i2r(4, xmm0); /* xmm0 = 66 65 -- -- -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
780 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
781 pslldq_i2r(8, xmm1); /* xmm1 = -- -- 67 64 -- -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
782 por_r2r(xmm7, xmm3); /* xmm3 = 77 75 74 -- -- -- -- 34 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
783 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
784 psrldq_i2r(6, xmm0); /* xmm0 = -- -- -- 66 65 -- -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
785 pslldq_i2r(4, xmm5); /* xmm5 = -- -- -- -- 54 51 -- -- */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
786 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
787 por_r2r(xmm1, xmm4); /* xmm4 = 76 73 67 64 -- -- 35 33 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
788 por_r2r(xmm2, xmm3); /* xmm3 = 77 75 74 -- -- 53 52 34 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
789 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
790 por_r2r(xmm5, xmm4); /* xmm4 = 76 73 67 64 54 51 35 33 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
791 por_r2r(xmm0, xmm3); /* xmm3 = 77 75 74 66 65 53 52 34 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
792 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
793 movdqu_r2m(xmm4, *(eax+96)); /* write 76 73 67 64 54 51 35 33 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
794 movdqu_r2m(xmm3, *(eax+112)); /* write 77 75 74 66 65 53 52 34 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
795 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
796 } /* end of SSE2_Dequantize Macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
797
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
798
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1978
diff changeset
/*
 * IDCT entry point: invokes the SSE2_Row_IDCT, SSE2_Transpose and
 * SSE2_Column_IDCT macros, in that order, on the block at input_data.
 *
 * The macros are defined outside this function and reach the locals below
 * through the register-style aliases (eax, ebx, edx) set up here, so the
 * local variable names are part of the contract with those macros.
 *
 * input_data: block of int16_t values. The input and output byte pointers
 *             below are both derived from it, so whatever the macros store
 *             through the output pointer lands in this same buffer.
 *             NOTE(review): presumably an 8x8 (64-value) block that must
 *             satisfy the alignment the macros' loads/stores expect --
 *             confirm against the macro bodies.
 */
799 void ff_vp3_idct_sse2(int16_t *input_data)
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
800 {
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Byte view of the block, used as the source pointer (aliased as eax). */
801 unsigned char *input_bytes = (unsigned char *)input_data;
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1978
diff changeset
/* Destination pointer (aliased as ebx further down); same address as input_bytes. */
802 unsigned char *output_data_bytes = (unsigned char *)input_data;
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Byte view of the SSE2_idct_data table (aliased as edx, indexed via C()). */
803 unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data;
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Byte view of eight_data. Not referenced by name in this function;
 * presumably used inside the macro expansions -- TODO confirm. */
804 unsigned char *Eight = (unsigned char *)eight_data;
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
805
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Register-style aliases that map x86 register names onto the locals above. */
806 #define eax input_bytes
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1978
diff changeset
807 //#define ebx dequant_matrix_bytes
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* NOTE(review): dequant_const_bytes is not declared in this function, so this
 * alias only compiles as long as nothing that gets expanded here uses ecx. */
808 #define ecx dequant_const_bytes
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
809 #define edx idct_data_bytes
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
810
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Addressing helpers with a 16-byte stride (eight int16_t values per step):
 * I(i) is relative to eax, O(i) to ebx, and C(i) is 1-based relative to edx.
 * O() is only usable once ebx has been defined below. */
811 #define I(i) (eax + 16 * i)
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
812 #define O(i) (ebx + 16 * i)
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
813 #define C(i) (edx + 16 * (i-1))
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
814
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1978
diff changeset
/* The dequantization step is disabled in this function. */
815 // SSE2_Dequantize();
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
816
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Drop any earlier ebx binding (its #define above is commented out) and
 * rebind ebx to the output pointer for the IDCT passes. */
817 #undef ebx
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
818 #define ebx output_data_bytes
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
819
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
/* Three macro passes; their bodies are defined outside this function. */
820 SSE2_Row_IDCT();
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
821
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
822 SSE2_Transpose();
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
823
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
824 SSE2_Column_IDCT();
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
825 }