annotate i386/vp3dsp_sse2.c @ 7778:e31b0b920475 libavcodec

theoradec: skip decoding of uncoded MV in 4MV code. Thusnelda, the new experimental Theora encoder, is using this Theora feature that was previously not exploited. Fixes issue579.
author aurel
date Wed, 03 Sep 2008 00:17:11 +0000
parents dfbf377bd066
children 4ab419106cb1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
1 /*
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
2 * Copyright (C) 2004 the ffmpeg project
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3089
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3089
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3089
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3089
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3089
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
14 * Lesser General Public License for more details.
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
15 *
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3089
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2967
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
19 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
20
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
21 /**
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
22 * @file vp3dsp_sse2.c
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
23 * SSE2-optimized functions cribbed from the original VP3 source code.
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
24 */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
25
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6597
diff changeset
26 #include "libavcodec/dsputil.h"
7742
bff9b5fea03f Use ff_pw_8 in MMX/SSE VP3 IDCT
conrad
parents: 6763
diff changeset
27 #include "dsputil_mmx.h"
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
28 #include "mmx.h"
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
29
7758
dfbf377bd066 Declare ff_vp3_idct_data to be uint16_t
conrad
parents: 7757
diff changeset
30 DECLARE_ALIGNED_16(const uint16_t, ff_vp3_idct_data[7 * 8]) =
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
31 {
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
32 64277,64277,64277,64277,64277,64277,64277,64277,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
33 60547,60547,60547,60547,60547,60547,60547,60547,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
34 54491,54491,54491,54491,54491,54491,54491,54491,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
35 46341,46341,46341,46341,46341,46341,46341,46341,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
36 36410,36410,36410,36410,36410,36410,36410,36410,
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
37 25080,25080,25080,25080,25080,25080,25080,25080,
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
38 12785,12785,12785,12785,12785,12785,12785,12785
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
39 };
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
40
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
41
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
42 #define SSE2_Column_IDCT() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
43 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
44 movdqa_m2r(*I(3), xmm2); /* xmm2 = i3 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
45 movdqa_m2r(*C(3), xmm6); /* xmm6 = c3 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
46 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
47 movdqa_r2r(xmm2, xmm4); /* xmm4 = i3 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
48 movdqa_m2r(*I(5), xmm7); /* xmm7 = i5 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
49 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
50 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
51 movdqa_m2r(*C(5), xmm1); /* xmm1 = c5 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
52 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
53 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
54 movdqa_r2r(xmm1, xmm5); /* xmm5 = c5 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
55 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
56 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
57 movdqa_m2r(*I(1), xmm3); /* xmm3 = i1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
58 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
59 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
60 movdqa_m2r(*C(1), xmm0); /* xmm0 = c1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
61 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
62 /* all registers are in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
63 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
64 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
65 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
66 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
67 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
68 movdqa_m2r(*I(7), xmm1); /* xmm1 = i7 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
69 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
70 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
71 movdqa_r2r(xmm0, xmm5); /* xmm5 = c1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
72 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
73 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
74 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
75 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
76 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
77 movdqa_m2r(*C(7), xmm7); /* xmm7 = c7 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
78 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
79 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
80 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
81 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
82 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
83 movdqa_m2r(*I(2), xmm2); /* xmm2 = i2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
84 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
85 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
86 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
87 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
88 movdqa_r2r(xmm2, xmm1); /* xmm1 = i2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
89 pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
90 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
91 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
92 movdqa_m2r(*I(6), xmm5); /* xmm5 = i6 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
93 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
94 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
95 movdqa_r2r(xmm5, xmm7); /* xmm7 = i6 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
96 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
97 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
98 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
99 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
100 paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
101 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
102 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
103 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
104 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
105 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
106 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
107 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
108 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
109 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
110 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
111 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
112 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
113 movdqa_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
114 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
115 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
116 movdqa_m2r(*C(4), xmm4); /* xmm4 = c4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
117 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
118 movdqa_r2r(xmm3, xmm5); /* xmm5 = B - D */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
119 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
120 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
121 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
122 movdqa_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
123 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
124 movdqa_r2r(xmm0, xmm2); /* xmm2 = A - C */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
125 movdqa_m2r(*I(0), xmm6); /* xmm6 = i0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
126 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
127 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. - ( A - C ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
128 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
129 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
130 movdqa_m2r(*I(4), xmm3); /* xmm3 = i4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
131 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
132 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
133 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
134 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
135 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
136 movdqa_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
137 pmulhw_r2r(xmm4, xmm6); /* xmm6 = (c4 - 1) * (i0 - i4) = F - (i0 - i4) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
138 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
139 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
140 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
141 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
142 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
143 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
144 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
145 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
146 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
147 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
148 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
149 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
150 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
151 movdqa_m2r(*I(1), xmm0); /* Load C. from I(1) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
152 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
153 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
154 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
155 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
156 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
157 paddsw_m2r(*Eight, xmm2); /* Adjust R2 and R1 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
158 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
159 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
160 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
161 psraw_i2r(4, xmm2); /* xmm2 = op2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
162 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
163 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
164 psraw_i2r(4, xmm1); /* xmm1 = op1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
165 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
166 movdqa_m2r(*I(2), xmm3); /* Load D. from I(2) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
167 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
168 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
169 movdqa_r2m(xmm2, *O(2)); /* Write out op2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
170 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
171 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
172 movdqa_r2m(xmm1, *O(1)); /* Write out op1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
173 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
174 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
175 paddsw_m2r(*Eight, xmm4); /* Adjust R4 and R3 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
176 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
177 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
178 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
179 psraw_i2r(4, xmm4); /* xmm4 = op4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
180 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
181 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
182 psraw_i2r(4, xmm3); /* xmm3 = op3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
183 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
184 paddsw_m2r(*Eight, xmm6); /* Adjust R6 and R5 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
185 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
186 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
187 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
188 psraw_i2r(4, xmm6); /* xmm6 = op6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
189 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
190 movdqa_r2m(xmm4, *O(4)); /* Write out op4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
191 psraw_i2r(4, xmm5); /* xmm5 = op5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
192 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
193 movdqa_r2m(xmm3, *O(3)); /* Write out op3 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
194 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
195 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
196 paddsw_m2r(*Eight, xmm7); /* Adjust R7 and R0 before shifting */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
197 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
198 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
199 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. = R0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
200 psraw_i2r(4, xmm7); /* xmm7 = op7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
201 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
202 movdqa_r2m(xmm6, *O(6)); /* Write out op6 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
203 psraw_i2r(4, xmm0); /* xmm0 = op0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
204 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
205 movdqa_r2m(xmm5, *O(5)); /* Write out op5 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
206 movdqa_r2m(xmm7, *O(7)); /* Write out op7 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
207 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
208 movdqa_r2m(xmm0, *O(0)); /* Write out op0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
209 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
210 } /* End of SSE2_Column_IDCT macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
211
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
212
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
213 #define SSE2_Row_IDCT() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
214 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
215 movdqa_m2r(*I(3), xmm2); /* xmm2 = i3 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
216 movdqa_m2r(*C(3), xmm6); /* xmm6 = c3 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
217 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
218 movdqa_r2r(xmm2, xmm4); /* xmm4 = i3 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
219 movdqa_m2r(*I(5), xmm7); /* xmm7 = i5 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
220 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
221 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
222 movdqa_m2r(*C(5), xmm1); /* xmm1 = c5 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
223 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
224 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
225 movdqa_r2r(xmm1, xmm5); /* xmm5 = c5 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
226 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
227 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
228 movdqa_m2r(*I(1), xmm3); /* xmm3 = i1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
229 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
230 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
231 movdqa_m2r(*C(1), xmm0); /* xmm0 = c1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
232 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
233 /* all registers are in use */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
234 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
235 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
236 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
237 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
238 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
239 movdqa_m2r(*I(7), xmm1); /* xmm1 = i7 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
240 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
241 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
242 movdqa_r2r(xmm0, xmm5); /* xmm5 = c1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
243 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
244 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
245 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
246 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
247 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
248 movdqa_m2r(*C(7), xmm7); /* xmm7 = c7 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
249 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
250 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
251 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
252 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
253 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
254 movdqa_m2r(*I(2), xmm2); /* xmm2 = i2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
255 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
256 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
257 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
258 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
259 movdqa_r2r(xmm2, xmm1); /* xmm1 = i2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
260 pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
261 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
262 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
263 movdqa_m2r(*I(6), xmm5); /* xmm5 = i6 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
264 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
265 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
266 movdqa_r2r(xmm5, xmm7); /* xmm7 = i6 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
267 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
268 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
269 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
270 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
271 paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
272 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
273 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
274 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
275 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
276 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
277 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
278 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
279 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
280 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
281 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
282 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
283 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
284 movdqa_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
285 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
286 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
287 movdqa_m2r(*C(4), xmm4); /* xmm4 = c4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
288 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
289 movdqa_r2r(xmm3, xmm5); /* xmm5 = B - D */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
290 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
291 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
292 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
293 movdqa_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
294 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
295 movdqa_r2r(xmm0, xmm2); /* xmm2 = A - C */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
296 movdqa_m2r(*I(0), xmm6); /* xmm6 = i0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
297 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
298 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
299 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
300 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
301 movdqa_m2r(*I(4), xmm3); /* xmm3 = i4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
302 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
303 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
304 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
305 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
306 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
307 movdqa_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
308 pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
309 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
310 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
311 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
312 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
313 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
314 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
315 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
316 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
317 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
318 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
319 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
320 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
321 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
322 movdqa_m2r(*I(1), xmm0); /* Load C. from I(1) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
323 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
324 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
325 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
326 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
327 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
328 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
329 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
330 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
331 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
332 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
333 movdqa_m2r(*I(2), xmm3); /* Load D. from I(2) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
334 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
335 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
336 movdqa_r2m(xmm2, *I(2)); /* Write out op2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
337 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
338 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
339 movdqa_r2m(xmm1, *I(1)); /* Write out op1 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
340 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
341 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
342 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
343 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
344 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
345 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
346 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
347 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
348 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
349 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
350 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
351 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
352 movdqa_r2m(xmm4, *I(4)); /* Write out op4 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
353 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
354 movdqa_r2m(xmm3, *I(3)); /* Write out op3 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
355 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
356 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
357 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
358 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
359 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
360 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
361 movdqa_r2m(xmm6, *I(6)); /* Write out op6 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
362 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
363 movdqa_r2m(xmm5, *I(5)); /* Write out op5 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
364 movdqa_r2m(xmm7, *I(7)); /* Write out op7 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
365 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
366 movdqa_r2m(xmm0, *I(0)); /* Write out op0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
367 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
368 } /* End of SSE2_Row_IDCT macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
369
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
370
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
371 #define SSE2_Transpose() { \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
372 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
373 movdqa_m2r(*I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
374 movdqa_m2r(*I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
375 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
376 movdqa_r2r(xmm4, xmm5); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
377 punpcklwd_r2r(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
378 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
379 punpckhwd_r2r(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
380 movdqa_m2r(*I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
381 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
382 movdqa_m2r(*I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
383 movdqa_r2r(xmm6, xmm7); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
384 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
385 punpcklwd_r2r(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
386 punpckhwd_r2r(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
387 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
388 movdqa_r2r(xmm4, xmm3); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
389 punpckldq_r2r(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
390 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
391 punpckhdq_r2r(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
392 movdqa_r2m(xmm3, *I(6)); /* save h3g3f3e3h2g2f2e2 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
393 /* Free xmm6 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
394 movdqa_r2r(xmm5, xmm6); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
395 punpckldq_r2r(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
396 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
397 punpckhdq_r2r(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
398 movdqa_m2r(*I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
399 /* Free xmm7 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
400 movdqa_m2r(*I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
401 movdqa_r2r(xmm0, xmm7); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
402 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
403 punpcklwd_r2r(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
404 punpckhwd_r2r(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
405 /* Free xmm1 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
406 movdqa_m2r(*I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
407 movdqa_m2r(*I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
408 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
409 movdqa_r2r(xmm2, xmm1); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
410 punpcklwd_r2r(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
411 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
412 punpckhwd_r2r(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
413 movdqa_r2r(xmm0, xmm3); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
414 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
415 punpckldq_r2r(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
416 punpckhdq_r2r(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
417 /* Free xmm2 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
418 movdqa_r2r(xmm7, xmm2); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
419 punpckldq_r2r(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
420 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
421 punpckhdq_r2r(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
422 movdqa_r2r(xmm0, xmm1); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
423 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
424 punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
425 punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
426 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
427 movdqa_r2m(xmm0, *I(0)); /* save I(0) */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
428 movdqa_r2m(xmm1, *I(1)); /* save I(1) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
429 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
430 movdqa_m2r(*I(6), xmm0); /* load h3g3f3e3h2g2f2e2 */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
431 movdqa_r2r(xmm3, xmm1); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
432 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
433 punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
434 punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
435 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
436 movdqa_r2r(xmm2, xmm4); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
437 punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
438 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
439 punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
440 movdqa_r2m(xmm1, *I(2)); /* save I(2) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
441 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
442 movdqa_r2m(xmm3, *I(3)); /* save I(3) */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
443 movdqa_r2m(xmm4, *I(4)); /* save I(4) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
444 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
445 movdqa_r2m(xmm2, *I(5)); /* save I(5) */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
446 movdqa_r2r(xmm7, xmm5); /* make a copy */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
447 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
448 punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
449 punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */ \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
450 \
6597
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
451 movdqa_r2m(xmm5, *I(6)); /* save I(6) */ \
25ed93c16c00 Eliminate movdqu in vp3dsp_sse2, patch from Alexander Strange astrangeAtithinkswDoTcom
lu_zero
parents: 6376
diff changeset
452 movdqa_r2m(xmm7, *I(7)); /* save I(7) */ \
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
453 \
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
454 } /* End of Transpose Macro */
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
455
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1978
diff changeset
456 void ff_vp3_idct_sse2(int16_t *input_data)
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
457 {
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
458 unsigned char *input_bytes = (unsigned char *)input_data;
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1978
diff changeset
459 unsigned char *output_data_bytes = (unsigned char *)input_data;
7757
b04a9742669c Don't declare SSE vp3 idct data static, so it can be used in the mmx version
conrad
parents: 7743
diff changeset
460 const unsigned char *idct_data_bytes = (const unsigned char *)ff_vp3_idct_data;
7742
bff9b5fea03f Use ff_pw_8 in MMX/SSE VP3 IDCT
conrad
parents: 6763
diff changeset
461 const unsigned char *Eight = (const unsigned char *)&ff_pw_8;
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
462
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
463 #define eax input_bytes
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
464 #define edx idct_data_bytes
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
465
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
466 #define I(i) (eax + 16 * i)
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
467 #define O(i) (ebx + 16 * i)
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
468 #define C(i) (edx + 16 * (i-1))
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
469
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
470 #define ebx output_data_bytes
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
471
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
472 SSE2_Row_IDCT();
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
473
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
474 SSE2_Transpose();
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2834
diff changeset
475
1970
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
476 SSE2_Column_IDCT();
51c098b1f404 SSE2-optimized variant of VP3 IDCT
melanson
parents:
diff changeset
477 }
5014
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
478
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
479 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
480 {
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
481 ff_vp3_idct_sse2(block);
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
482 put_signed_pixels_clamped_mmx(block, dest, line_size);
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
483 }
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
484
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
485 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
486 {
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
487 ff_vp3_idct_sse2(block);
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
488 add_pixels_clamped_mmx(block, dest, line_size);
42b99a3aadde better separation of vp3dsp functions from dsputil_mmx.c
aurel
parents: 5010
diff changeset
489 }