Mercurial > libavcodec.hg
annotate i386/vp3dsp_mmx.c @ 2841:bceeca1bb30f libavcodec
vbr audio encode patch by (Justin Ruggles: jruggle, earthlink net)
with changes by me
int->float as video uses float too
remove silent clipping to some per-codec range; this should result in an error instead
remove change to utils.c as it's inconsistent with video
author | michael |
---|---|
date | Sun, 21 Aug 2005 20:27:00 +0000 |
parents | ba8ecddf5598 |
children | ef2149182f1c |
rev | line source |
---|---|
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
1 /* |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
2 * Copyright (C) 2004 the ffmpeg project |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
3 * |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
4 * This library is free software; you can redistribute it and/or |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
5 * modify it under the terms of the GNU Lesser General Public |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
6 * License as published by the Free Software Foundation; either |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
7 * version 2 of the License, or (at your option) any later version. |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
8 * |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
9 * This library is distributed in the hope that it will be useful, |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
12 * Lesser General Public License for more details. |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
13 * |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
14 * You should have received a copy of the GNU Lesser General Public |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
15 * License along with this library; if not, write to the Free Software |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
17 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
18 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
19 /** |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
20 * @file vp3dsp_mmx.c |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
21 * MMX-optimized functions cribbed from the original VP3 source code. |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
22 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
23 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
24 #include "../dsputil.h" |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
25 #include "mmx.h" |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
26 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
27 #define IdctAdjustBeforeShift 8 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
28 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total) |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
30 * idct_constants[0..15] = Mask table (M(I)) |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
31 * idct_constants[16..43] = Cosine table (C(I)) |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
32 * idct_constants[44..47] = 8 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
33 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
34 static uint16_t idct_constants[(4 + 7 + 1) * 4]; |
2753 | 35 static const uint16_t idct_cosine_table[7] = { |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
36 64277, 60547, 54491, 46341, 36410, 25080, 12785 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
37 }; |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
38 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
39 #define r0 mm0 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
40 #define r1 mm1 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
41 #define r2 mm2 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
42 #define r3 mm3 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
43 #define r4 mm4 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
44 #define r5 mm5 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
45 #define r6 mm6 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
46 #define r7 mm7 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
47 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
49 #define BeginIDCT() { \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
50 movq_m2r(*I(3), r2); \ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
51 movq_m2r(*C(3), r6); \ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
52 movq_r2r(r2, r4); \ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
53 movq_m2r(*J(5), r7); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
54 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
55 movq_m2r(*C(5), r1); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
56 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
57 movq_r2r(r1, r5); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
58 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
59 movq_m2r(*I(1), r3); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
60 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
61 movq_m2r(*C(1), r0); /* (all registers are in use) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
62 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
63 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
64 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
65 movq_m2r(*J(7), r1); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
66 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
67 movq_r2r(r0, r5); /* r5 = c1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
68 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
69 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
70 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
71 movq_m2r(*C(7), r7); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
72 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
73 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
74 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
75 movq_m2r(*I(2), r2); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
76 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
77 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
78 movq_r2r(r2, r1); /* r1 = i2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
79 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
80 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
81 movq_m2r(*J(6), r5); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
82 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
83 movq_r2r(r5, r7); /* r7 = i6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
84 psubsw_r2r(r4, r0); /* r0 = A - C */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
85 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
86 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
87 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
88 paddsw_r2r(r4, r4); /* r4 = C + C */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
89 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
90 psubsw_r2r(r6, r3); /* r3 = B - D */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
91 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
92 paddsw_r2r(r6, r6); /* r6 = D + D */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
93 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
94 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
95 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
96 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
97 movq_m2r(*C(4), r4); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
98 movq_r2r(r3, r5); /* r5 = B - D */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
99 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
100 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
101 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
102 movq_r2r(r0, r2); /* r2 = A - C */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
103 movq_m2r(*I(0), r6); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
104 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
105 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
106 movq_m2r(*J(4), r3); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
107 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
108 paddw_r2r(r0, r2); /* r2 = A. = c4 * (A - C) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
109 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
110 movq_r2r(r6, r0); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
111 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
112 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
113 paddsw_r2r(r1, r1); /* r1 = H + H */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
114 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
115 paddsw_r2r(r5, r1); /* r1 = H. = B. + H */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
116 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
117 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
118 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
119 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
120 movq_m2r(*I(1), r0); /* r0 = C. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
121 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
122 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
123 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
124 } |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
125 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
126 /* RowIDCT gets ready to transpose */ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
127 #define RowIDCT() { \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
128 \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
129 BeginIDCT(); \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
130 \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
131 movq_m2r(*I(2), r3); /* r3 = D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
132 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
133 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
134 paddsw_r2r(r7, r7); /* r7 = G + G */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
135 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
136 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
137 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
138 paddsw_r2r(r3, r3); \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
139 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
140 paddsw_r2r(r5, r5); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
141 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
142 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
143 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
144 paddsw_r2r(r0, r0); \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
145 movq_r2m(r1, *I(1)); /* save R1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
146 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
147 } |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
148 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
149 /* Column IDCT normalizes and stores final results */ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
150 #define ColumnIDCT() { \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
151 \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
152 BeginIDCT(); \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
153 \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
154 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
155 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
156 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
157 psraw_i2r(4, r2); /* r2 = NR2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
158 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
159 psraw_i2r(4, r1); /* r1 = NR1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
160 movq_m2r(*I(2), r3); /* r3 = D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
161 paddsw_r2r(r7, r7); /* r7 = G + G */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
162 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
163 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
164 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
165 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
166 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
167 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
168 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
169 psraw_i2r(4, r4); /* r4 = NR4 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
170 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
171 psraw_i2r(4, r3); /* r3 = NR3 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
172 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
173 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
174 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
175 psraw_i2r(4, r6); /* r6 = NR6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
176 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
177 psraw_i2r(4, r5); /* r5 = NR5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
178 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
179 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
180 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
181 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
182 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
183 psraw_i2r(4, r7); /* r7 = NR7 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
184 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
185 psraw_i2r(4, r0); /* r0 = NR0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
186 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
187 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
188 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
189 } |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
190 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
191 /* Following macro does two 4x4 transposes in place. |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
192 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
193 At entry (we assume): |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
194 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
195 r0 = a3 a2 a1 a0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
196 I(1) = b3 b2 b1 b0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
197 r2 = c3 c2 c1 c0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
198 r3 = d3 d2 d1 d0 |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
199 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
200 r4 = e3 e2 e1 e0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
201 r5 = f3 f2 f1 f0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
202 r6 = g3 g2 g1 g0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
203 r7 = h3 h2 h1 h0 |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
204 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
205 At exit, we have: |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
206 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
207 I(0) = d0 c0 b0 a0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
208 I(1) = d1 c1 b1 a1 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
209 I(2) = d2 c2 b2 a2 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
210 I(3) = d3 c3 b3 a3 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
211 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
212 J(4) = h0 g0 f0 e0 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
213 J(5) = h1 g1 f1 e1 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
214 J(6) = h2 g2 f2 e2 |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
215 J(7) = h3 g3 f3 e3 |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
216 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
217 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
218 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
219 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
220 Since r1 is free at entry, we calculate the Js first. */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
221 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
222 #define Transpose() { \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
223 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
224 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
225 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
226 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
227 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
228 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
229 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
230 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
231 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
232 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
233 movq_r2m(r4, *J(4)); /* store R4 at J(4) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
234 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
235 movq_r2m(r5, *J(5)); /* store R5 at J(5) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
236 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
237 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
238 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
239 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
240 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
241 movq_r2m(r6, *J(7)); /* store R7 at J(7) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
242 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
243 movq_r2m(r1, *J(6)); /* store R6 at J(6) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
244 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
245 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
246 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
247 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
248 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
249 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
250 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
251 movq_r2m(r0, *I(0)); /* store R0 at I(0) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
252 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
253 movq_r2m(r1, *I(1)); /* store R1 at I(1) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
254 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
255 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
256 movq_r2m(r4, *I(3)); /* store R3 at I(3) */ \ |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
257 movq_r2m(r2, *I(2)); /* store R2 at I(2) */ \ |
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
258 } |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
259 |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1977
diff
changeset
|
260 void ff_vp3_dsp_init_mmx(void) |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
261 { /* Fills idct_constants[16..47]: seven 4-word groups from idct_cosine_table, then one group of IdctAdjustBeforeShift. */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
262 int j = 16; /* NOTE(review): this initial value is never read; j is reassigned to 1 below */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
263 uint16_t *p; |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
264 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
265 j = 1; |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
266 do { /* j runs from 1 to 7 inclusive */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
267 p = idct_constants + ((j + 3) << 2); /* element offset 16 + (j - 1) * 4 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
268 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1]; /* replicate one table entry into four consecutive words */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
269 } while (++j <= 7); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
270 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
271 idct_constants[44] = idct_constants[45] = /* last group: four copies of the pre-shift adjustment */ |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
272 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift; |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
273 } |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
274 |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1977
diff
changeset
|
275 void ff_vp3_idct_mmx(int16_t *output_data) |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
276 { /* Runs RowIDCT + Transpose twice, then ColumnIDCT twice, on output_data, redefining I()/J() before each pass. */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
277 /* eax = quantized input |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
278 * ebx = dequantizer matrix |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
279 * ecx = IDCT constants |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
280 * M(I) = ecx + MaskOffset(0) + I * 8 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
281 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
282 * edx = output |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
283 * r0..r7 = mm0..mm7 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
284 * NOTE(review): this function receives only output_data; the eax/ebx/ecx/edx assignments above may be stale -- confirm. */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
285 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
286 #define C(x) (idct_constants + 16 + (x - 1) * 4) /* address of the x-th 4-word group starting at idct_constants[16] */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
287 #define Eight (idct_constants + 44) /* address of idct_constants[44..47] */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
288 |
1969
56cb752222cc
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
melanson
parents:
1866
diff
changeset
|
289 /* at this point, function has completed dequantization + dezigzag + |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
290 * partial transposition; now do the idct itself (NOTE(review): no such code is visible in this function; presumably done by the caller -- confirm) */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
291 #define I(K) (output_data + K * 8) /* row pass 1: element K * 8 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
292 #define J(K) (output_data + ((K - 4) * 8) + 4) /* row pass 1: element (K - 4) * 8 + 4 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
293 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
294 RowIDCT(); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
295 Transpose(); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
296 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
297 #undef I |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
298 #undef J |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
299 #define I(K) (output_data + (K * 8) + 32) /* row pass 2: pass-1 address shifted by 32 elements */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
300 #define J(K) (output_data + ((K - 4) * 8) + 36) /* row pass 2: pass-1 address shifted by 32 elements */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
301 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
302 RowIDCT(); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
303 Transpose(); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
304 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
305 #undef I |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
306 #undef J |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
307 #define I(K) (output_data + K * 8) /* column pass 1: I and J both address element K * 8 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
308 #define J(K) (output_data + K * 8) |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
309 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
310 ColumnIDCT(); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
311 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
312 #undef I |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
313 #undef J |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
314 #define I(K) (output_data + (K * 8) + 4) /* column pass 2: I and J both address element K * 8 + 4 */ |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
315 #define J(K) (output_data + (K * 8) + 4) |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
316 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
317 ColumnIDCT(); |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
318 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
319 #undef I |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
320 #undef J |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
321 |
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
diff
changeset
|
322 } |