annotate i386/vp3dsp_mmx.c @ 2841:bceeca1bb30f libavcodec

vbr audio encode patch by (Justin Ruggles: jruggle, earthlink net) with changes by me int->float as video uses float too remove silent cliping to some per codec range, this should result in an error instead remove change to utils.c as its inconsistant with video
author michael
date Sun, 21 Aug 2005 20:27:00 +0000
parents ba8ecddf5598
children ef2149182f1c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
1 /*
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
2 * Copyright (C) 2004 the ffmpeg project
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
3 *
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
4 * This library is free software; you can redistribute it and/or
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
5 * modify it under the terms of the GNU Lesser General Public
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
6 * License as published by the Free Software Foundation; either
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
7 * version 2 of the License, or (at your option) any later version.
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
8 *
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
9 * This library is distributed in the hope that it will be useful,
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
12 * Lesser General Public License for more details.
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
13 *
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
14 * You should have received a copy of the GNU Lesser General Public
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
15 * License along with this library; if not, write to the Free Software
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
17 */
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
18
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
19 /**
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
20 * @file vp3dsp_mmx.c
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
21 * MMX-optimized functions cribbed from the original VP3 source code.
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
22 */
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
23
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
24 #include "../dsputil.h"
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
25 #include "mmx.h"
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
26
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
27 #define IdctAdjustBeforeShift 8
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
28
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
30 * idct_constants[0..15] = Mask table (M(I))
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
31 * idct_constants[16..43] = Cosine table (C(I))
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
32 * idct_constants[44..47] = 8
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
33 */
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
34 static uint16_t idct_constants[(4 + 7 + 1) * 4];
2753
ba8ecddf5598 adding a few const
michael
parents: 2696
diff changeset
35 static const uint16_t idct_cosine_table[7] = {
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
36 64277, 60547, 54491, 46341, 36410, 25080, 12785
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
37 };
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
38
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
39 #define r0 mm0
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
40 #define r1 mm1
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
41 #define r2 mm2
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
42 #define r3 mm3
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
43 #define r4 mm4
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
44 #define r5 mm5
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
45 #define r6 mm6
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
46 #define r7 mm7
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
47
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
49 #define BeginIDCT() { \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
50 movq_m2r(*I(3), r2); \
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
51 movq_m2r(*C(3), r6); \
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
52 movq_r2r(r2, r4); \
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
53 movq_m2r(*J(5), r7); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
54 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
55 movq_m2r(*C(5), r1); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
56 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
57 movq_r2r(r1, r5); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
58 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
59 movq_m2r(*I(1), r3); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
60 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
61 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
62 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
63 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
64 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
65 movq_m2r(*J(7), r1); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
66 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
67 movq_r2r(r0, r5); /* r5 = c1 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
68 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
69 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
70 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
71 movq_m2r(*C(7), r7); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
72 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
73 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
74 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
75 movq_m2r(*I(2), r2); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
76 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
77 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
78 movq_r2r(r2, r1); /* r1 = i2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
79 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
80 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
81 movq_m2r(*J(6), r5); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
82 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
83 movq_r2r(r5, r7); /* r7 = i6 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
84 psubsw_r2r(r4, r0); /* r0 = A - C */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
85 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
86 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
87 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
88 paddsw_r2r(r4, r4); /* r4 = C + C */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
89 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
90 psubsw_r2r(r6, r3); /* r3 = B - D */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
91 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
92 paddsw_r2r(r6, r6); /* r6 = D + D */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
93 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
94 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
95 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
96 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
97 movq_m2r(*C(4), r4); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
98 movq_r2r(r3, r5); /* r5 = B - D */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
99 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
100 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
101 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
102 movq_r2r(r0, r2); /* r2 = A - C */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
103 movq_m2r(*I(0), r6); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
104 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
105 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
106 movq_m2r(*J(4), r3); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
107 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
108 paddw_r2r(r0, r2); /* r2 = A. = c4 * (A - C) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
109 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
110 movq_r2r(r6, r0); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
111 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
112 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
113 paddsw_r2r(r1, r1); /* r1 = H + H */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
114 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
115 paddsw_r2r(r5, r1); /* r1 = H. = B. + H */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
116 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
117 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
118 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
119 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
120 movq_m2r(*I(1), r0); /* r0 = C. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
121 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
122 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
123 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
124 }
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
125
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
126 /* RowIDCT gets ready to transpose */
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
127 #define RowIDCT() { \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
128 \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
129 BeginIDCT(); \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
130 \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
131 movq_m2r(*I(2), r3); /* r3 = D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
132 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
133 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
134 paddsw_r2r(r7, r7); /* r7 = G + G */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
135 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
136 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
137 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
138 paddsw_r2r(r3, r3); \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
139 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
140 paddsw_r2r(r5, r5); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
141 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
142 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
143 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
144 paddsw_r2r(r0, r0); \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
145 movq_r2m(r1, *I(1)); /* save R1 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
146 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
147 }
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
148
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
149 /* Column IDCT normalizes and stores final results */
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
150 #define ColumnIDCT() { \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
151 \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
152 BeginIDCT(); \
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
153 \
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
154 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
155 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
156 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
157 psraw_i2r(4, r2); /* r2 = NR2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
158 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
159 psraw_i2r(4, r1); /* r1 = NR1 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
160 movq_m2r(*I(2), r3); /* r3 = D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
161 paddsw_r2r(r7, r7); /* r7 = G + G */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
162 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
163 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
164 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
165 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
166 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
167 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
168 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
169 psraw_i2r(4, r4); /* r4 = NR4 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
170 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
171 psraw_i2r(4, r3); /* r3 = NR3 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
172 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
173 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
174 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
175 psraw_i2r(4, r6); /* r6 = NR6 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
176 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
177 psraw_i2r(4, r5); /* r5 = NR5 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
178 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
179 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
180 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
181 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
182 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
183 psraw_i2r(4, r7); /* r7 = NR7 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
184 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
185 psraw_i2r(4, r0); /* r0 = NR0 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
186 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
187 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
188 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
189 }
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
190
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
/* Transpose(): two in-place 4x4 transposes of 16-bit words.
 *
 * Each row below lists the four words of one 64-bit operand, most
 * significant word first.
 *
 * Expected on entry:
 *
 *   r0   = a3 a2 a1 a0
 *   I(1) = b3 b2 b1 b0      (in memory, not in a register)
 *   r2   = c3 c2 c1 c0
 *   r3   = d3 d2 d1 d0
 *
 *   r4   = e3 e2 e1 e0
 *   r5   = f3 f2 f1 f0
 *   r6   = g3 g2 g1 g0
 *   r7   = h3 h2 h1 h0
 *
 * Stored on exit:
 *
 *   I(0) = d0 c0 b0 a0
 *   I(1) = d1 c1 b1 a1
 *   I(2) = d2 c2 b2 a2
 *   I(3) = d3 c3 b3 a3
 *
 *   J(4) = h0 g0 f0 e0
 *   J(5) = h1 g1 f1 e1
 *   J(6) = h2 g2 f2 e2
 *   J(7) = h3 g3 f3 e3
 *
 * I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3, and
 * J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
 *
 * r1 is the only register that is free on entry, so the e..h half (the
 * Js) is done first; r0 is parked in I(0) meanwhile and reloaded once the
 * Js no longer need the registers.  I(0) is therefore written twice.
 *
 * The statement order interleaves stores with unpack operations and must
 * not be rearranged casually.  The body is wrapped in do { } while (0) so
 * that "Transpose();" is a single statement and is safe in an unbraced
 * if/else.
 */

#define Transpose() do { \
    movq_r2r(r4, r1);           /* r1 = e3 e2 e1 e0 */ \
    punpcklwd_r2r(r5, r4);      /* r4 = f1 e1 f0 e0 */ \
    movq_r2m(r0, *I(0));        /* park a3 a2 a1 a0 in I(0) */ \
    punpckhwd_r2r(r5, r1);      /* r1 = f3 e3 f2 e2 */ \
    movq_r2r(r6, r0);           /* r0 = g3 g2 g1 g0 */ \
    punpcklwd_r2r(r7, r6);      /* r6 = h1 g1 h0 g0 */ \
    movq_r2r(r4, r5);           /* r5 = f1 e1 f0 e0 */ \
    punpckldq_r2r(r6, r4);      /* r4 = h0 g0 f0 e0 = R4 */ \
    punpckhdq_r2r(r6, r5);      /* r5 = h1 g1 f1 e1 = R5 */ \
    movq_r2r(r1, r6);           /* r6 = f3 e3 f2 e2 */ \
    movq_r2m(r4, *J(4));        /* store R4 */ \
    punpckhwd_r2r(r7, r0);      /* r0 = h3 g3 h2 g2 */ \
    movq_r2m(r5, *J(5));        /* store R5 */ \
    punpckhdq_r2r(r0, r6);      /* r6 = h3 g3 f3 e3 = R7 */ \
    movq_m2r(*I(0), r4);        /* r4 = a3 a2 a1 a0 (reload parked row) */ \
    punpckldq_r2r(r0, r1);      /* r1 = h2 g2 f2 e2 = R6 */ \
    movq_m2r(*I(1), r5);        /* r5 = b3 b2 b1 b0 */ \
    movq_r2r(r4, r0);           /* r0 = a3 a2 a1 a0 */ \
    movq_r2m(r6, *J(7));        /* store R7 */ \
    punpcklwd_r2r(r5, r0);      /* r0 = b1 a1 b0 a0 */ \
    movq_r2m(r1, *J(6));        /* store R6 */ \
    punpckhwd_r2r(r5, r4);      /* r4 = b3 a3 b2 a2 */ \
    movq_r2r(r2, r5);           /* r5 = c3 c2 c1 c0 */ \
    punpcklwd_r2r(r3, r2);      /* r2 = d1 c1 d0 c0 */ \
    movq_r2r(r0, r1);           /* r1 = b1 a1 b0 a0 */ \
    punpckldq_r2r(r2, r0);      /* r0 = d0 c0 b0 a0 = R0 */ \
    punpckhdq_r2r(r2, r1);      /* r1 = d1 c1 b1 a1 = R1 */ \
    movq_r2r(r4, r2);           /* r2 = b3 a3 b2 a2 */ \
    movq_r2m(r0, *I(0));        /* store R0 */ \
    punpckhwd_r2r(r3, r5);      /* r5 = d3 c3 d2 c2 */ \
    movq_r2m(r1, *I(1));        /* store R1 */ \
    punpckhdq_r2r(r5, r4);      /* r4 = d3 c3 b3 a3 = R3 */ \
    punpckldq_r2r(r5, r2);      /* r2 = d2 c2 b2 a2 = R2 */ \
    movq_r2m(r4, *I(3));        /* store R3 */ \
    movq_r2m(r2, *I(2));        /* store R2 */ \
} while (0)
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
259
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1977
diff changeset
260 void ff_vp3_dsp_init_mmx(void)
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
261 {
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
262 int j = 16;
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
263 uint16_t *p;
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
264
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
265 j = 1;
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
266 do {
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
267 p = idct_constants + ((j + 3) << 2);
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
268 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
269 } while (++j <= 7);
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
270
1969
56cb752222cc correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents: 1866
diff changeset
271 idct_constants[44] = idct_constants[45] =
1866
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
272 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
273 }
1755f959ab7f seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff changeset
274
2696
9699d325049d porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents: 1977
diff changeset
/**
 * MMX inverse DCT for VP3, operating on the buffer at output_data.
 *
 * The only buffer this function references is output_data, addressed
 * through the I()/J() macros with a row stride of 8 int16_t elements, so
 * the transform appears to be done in place on an 8x8 block of
 * coefficients (NOTE(review): confirm block size against the callers).
 *
 * The work is done by the RowIDCT(), Transpose() and ColumnIDCT() macros
 * defined earlier in this file.  They address memory through I(K), J(K),
 * C(x) and Eight, which are (re)defined here before each invocation:
 *
 *   1. RowIDCT + Transpose with I(K) = row K, elements 0..3 and
 *      J(K) = row K - 4, elements 4..7  (element offsets 0..31).
 *   2. The same again, shifted by 32 elements (offsets 32..63).
 *   3. ColumnIDCT with I(K) = J(K) = row K, elements 0..3.
 *   4. ColumnIDCT with I(K) = J(K) = row K, elements 4..7.
 *
 * C(x) and Eight point into idct_constants at the entries filled in by
 * ff_vp3_dsp_init_mmx(), so that function presumably has to run first.
 *
 * @param output_data  coefficient block; read and written through the
 *                     I()/J() windows described above
 */
void ff_vp3_idct_mmx(int16_t *output_data)
{
    /* Constant table layout, in uint16_t elements:
     *   C(x)  = idct_constants + 16 + (x - 1) * 4   (x = 1..7)
     *   Eight = idct_constants + 44
     * NOTE(review): an earlier comment stated r0..r7 = mm0..mm7; the
     * definitions are outside this function.
     * C and Eight are intentionally left defined after the function,
     * as in the original code. */

#define C(x) (idct_constants + 16 + ((x) - 1) * 4)
#define Eight (idct_constants + 44)

    /* row pass, first half: element offsets 0..31 */
#define I(K) (output_data + (K) * 8)
#define J(K) (output_data + (((K) - 4) * 8) + 4)

    RowIDCT();
    Transpose();

#undef I
#undef J
    /* row pass, second half: same windows shifted by 32 elements */
#define I(K) (output_data + ((K) * 8) + 32)
#define J(K) (output_data + (((K) - 4) * 8) + 36)

    RowIDCT();
    Transpose();

#undef I
#undef J
    /* column pass over elements 0..3 of every row; I and J coincide */
#define I(K) (output_data + (K) * 8)
#define J(K) (output_data + (K) * 8)

    ColumnIDCT();

#undef I
#undef J
    /* column pass over elements 4..7 of every row */
#define I(K) (output_data + ((K) * 8) + 4)
#define J(K) (output_data + ((K) * 8) + 4)

    ColumnIDCT();

#undef I
#undef J

}