Mercurial > libavcodec.hg
comparison i386/vp3dsp_mmx.c @ 7877:8759422d660a libavcodec
Rewrite MMX VP3 IDCT in inline asm
author | conrad |
---|---|
date | Wed, 17 Sep 2008 19:30:33 +0000 |
parents | 892ca48b7d76 |
children | e05e021fce72 |
comparison
equal
deleted
inserted
replaced
7876:3fd591f125b5 | 7877:8759422d660a |
---|---|
23 * MMX-optimized functions cribbed from the original VP3 source code. | 23 * MMX-optimized functions cribbed from the original VP3 source code. |
24 */ | 24 */ |
25 | 25 |
26 #include "libavcodec/dsputil.h" | 26 #include "libavcodec/dsputil.h" |
27 #include "dsputil_mmx.h" | 27 #include "dsputil_mmx.h" |
28 #include "mmx.h" | |
29 | 28 |
30 extern const uint16_t ff_vp3_idct_data[]; | 29 extern const uint16_t ff_vp3_idct_data[]; |
31 | 30 |
32 #define r0 mm0 | |
33 #define r1 mm1 | |
34 #define r2 mm2 | |
35 #define r3 mm3 | |
36 #define r4 mm4 | |
37 #define r5 mm5 | |
38 #define r6 mm6 | |
39 #define r7 mm7 | |
40 | |
41 /* from original comments: The Macro does IDct on 4 1-D Dcts */ | 31 /* from original comments: The Macro does IDct on 4 1-D Dcts */ |
42 #define BeginIDCT() { \ | 32 #define BeginIDCT() \ |
43 movq_m2r(*I(3), r2); \ | 33 "movq "I(3)", %%mm2 \n\t" \ |
44 movq_m2r(*C(3), r6); \ | 34 "movq "C(3)", %%mm6 \n\t" \ |
45 movq_r2r(r2, r4); \ | 35 "movq %%mm2, %%mm4 \n\t" \ |
46 movq_m2r(*J(5), r7); \ | 36 "movq "J(5)", %%mm7 \n\t" \ |
47 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \ | 37 "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ |
48 movq_m2r(*C(5), r1); \ | 38 "movq "C(5)", %%mm1 \n\t" \ |
49 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \ | 39 "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ |
50 movq_r2r(r1, r5); \ | 40 "movq %%mm1, %%mm5 \n\t" \ |
51 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \ | 41 "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ |
52 movq_m2r(*I(1), r3); \ | 42 "movq "I(1)", %%mm3 \n\t" \ |
53 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \ | 43 "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ |
54 movq_m2r(*C(1), r0); /* (all registers are in use) */ \ | 44 "movq "C(1)", %%mm0 \n\t" \ |
55 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \ | 45 "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ |
56 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \ | 46 "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ |
57 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \ | 47 "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ |
58 movq_m2r(*J(7), r1); \ | 48 "movq "J(7)", %%mm1 \n\t" \ |
59 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \ | 49 "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ |
60 movq_r2r(r0, r5); /* r5 = c1 */ \ | 50 "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ |
61 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \ | 51 "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ |
62 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \ | 52 "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ |
63 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \ | 53 "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ |
64 movq_m2r(*C(7), r7); \ | 54 "movq "C(7)", %%mm7 \n\t" \ |
65 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \ | 55 "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ |
66 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \ | 56 "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ |
67 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \ | 57 "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ |
68 movq_m2r(*I(2), r2); \ | 58 "movq "I(2)", %%mm2 \n\t" \ |
69 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \ | 59 "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ |
70 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \ | 60 "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ |
71 movq_r2r(r2, r1); /* r1 = i2 */ \ | 61 "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ |
72 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \ | 62 "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ |
73 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \ | 63 "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ |
74 movq_m2r(*J(6), r5); \ | 64 "movq "J(6)", %%mm5 \n\t" \ |
75 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \ | 65 "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ |
76 movq_r2r(r5, r7); /* r7 = i6 */ \ | 66 "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ |
77 psubsw_r2r(r4, r0); /* r0 = A - C */ \ | 67 "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ |
78 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \ | 68 "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ |
79 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \ | 69 "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ |
80 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \ | 70 "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ |
81 paddsw_r2r(r4, r4); /* r4 = C + C */ \ | 71 "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ |
82 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \ | 72 "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ |
83 psubsw_r2r(r6, r3); /* r3 = B - D */ \ | 73 "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ |
84 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \ | 74 "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ |
85 paddsw_r2r(r6, r6); /* r6 = D + D */ \ | 75 "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ |
86 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \ | 76 "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ |
87 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \ | 77 "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ |
88 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \ | 78 "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ |
89 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \ | 79 "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ |
90 movq_m2r(*C(4), r4); \ | 80 "movq "C(4)", %%mm4 \n\t" \ |
91 movq_r2r(r3, r5); /* r5 = B - D */ \ | 81 "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ |
92 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \ | 82 "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ |
 93 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \ | 83 "paddsw %%mm2, %%mm7 \n\t" /* r7 = G = c6*i6 + c2*i2 */ \ |
94 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \ | 84 "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ |
95 movq_r2r(r0, r2); /* r2 = A - C */ \ | 85 "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ |
96 movq_m2r(*I(0), r6); \ | 86 "movq "I(0)", %%mm6 \n\t" \ |
97 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \ | 87 "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ |
98 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \ | 88 "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ |
99 movq_m2r(*J(4), r3); \ | 89 "movq "J(4)", %%mm3 \n\t" \ |
100 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \ | 90 "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \ |
101 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \ | 91 "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ |
102 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \ | 92 "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ |
103 movq_r2r(r6, r0); \ | 93 "movq %%mm6, %%mm0 \n\t" \ |
104 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \ | 94 "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ |
105 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \ | 95 "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ |
106 paddsw_r2r(r1, r1); /* r1 = H + H */ \ | 96 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ |
107 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \ | 97 "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ |
108 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \ | 98 "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ |
109 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \ | 99 "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ |
110 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \ | 100 "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ |
111 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \ | 101 "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ |
112 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \ | 102 "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ |
113 movq_m2r(*I(1), r0); /* r0 = C. */ \ | 103 "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ |
114 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \ | 104 "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ |
115 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \ | 105 "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ |
116 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \ | 106 "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ |
117 } | |
118 | 107 |
119 /* RowIDCT gets ready to transpose */ | 108 /* RowIDCT gets ready to transpose */ |
120 #define RowIDCT() { \ | 109 #define RowIDCT() \ |
121 \ | 110 BeginIDCT() \ |
122 BeginIDCT(); \ | 111 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ |
123 \ | 112 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ |
124 movq_m2r(*I(2), r3); /* r3 = D. */ \ | 113 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ |
125 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ | 114 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ |
126 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ | 115 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ |
127 paddsw_r2r(r7, r7); /* r7 = G + G */ \ | 116 "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ |
128 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ | 117 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ |
 129 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ | 116 "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ |
130 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ | 119 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ |
131 paddsw_r2r(r3, r3); \ | 120 "paddsw %%mm5, %%mm5 \n\t" \ |
132 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ | 121 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ |
133 paddsw_r2r(r5, r5); \ | 122 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ |
134 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ | 123 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ |
135 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ | 124 "paddsw %%mm0, %%mm0 \n\t" \ |
136 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ | 125 "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ |
137 paddsw_r2r(r0, r0); \ | 126 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ |
138 movq_r2m(r1, *I(1)); /* save R1 */ \ | |
139 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ | |
140 } | |
141 | 127 |
142 /* Column IDCT normalizes and stores final results */ | 128 /* Column IDCT normalizes and stores final results */ |
143 #define ColumnIDCT() { \ | 129 #define ColumnIDCT() \ |
144 \ | 130 BeginIDCT() \ |
145 BeginIDCT(); \ | 131 "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ |
146 \ | 132 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ |
147 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \ | 133 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ |
148 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ | 134 "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ |
149 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ | 135 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ |
150 psraw_i2r(4, r2); /* r2 = NR2 */ \ | 136 "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ |
151 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ | 137 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ |
152 psraw_i2r(4, r1); /* r1 = NR1 */ \ | 138 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ |
153 movq_m2r(*I(2), r3); /* r3 = D. */ \ | 139 "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ |
154 paddsw_r2r(r7, r7); /* r7 = G + G */ \ | 140 "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ |
155 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \ | 141 "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ |
156 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ | 142 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ |
157 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \ | 143 "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ |
158 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ | 144 "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ |
159 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \ | 145 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ |
160 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \ | 146 "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ |
161 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ | 147 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ |
162 psraw_i2r(4, r4); /* r4 = NR4 */ \ | 148 "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ |
163 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ | 149 "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ |
164 psraw_i2r(4, r3); /* r3 = NR3 */ \ | 150 "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ |
165 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \ | 151 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ |
166 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \ | 152 "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ |
167 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ | 153 "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ |
168 psraw_i2r(4, r6); /* r6 = NR6 */ \ | 154 "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ |
169 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \ | 155 "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ |
170 psraw_i2r(4, r5); /* r5 = NR5 */ \ | 156 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ |
171 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \ | 157 "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ |
172 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ | 158 "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ |
173 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \ | 159 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \ |
174 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \ | 160 "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ |
175 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ | 161 "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ |
176 psraw_i2r(4, r7); /* r7 = NR7 */ \ | 162 "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ |
177 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \ | 163 "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ |
178 psraw_i2r(4, r0); /* r0 = NR0 */ \ | 164 "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ |
179 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \ | 165 "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ |
180 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \ | |
181 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \ | |
182 } | |
183 | 166 |
184 /* Following macro does two 4x4 transposes in place. | 167 /* Following macro does two 4x4 transposes in place. |
185 | 168 |
186 At entry (we assume): | 169 At entry (we assume): |
187 | 170 |
209 | 192 |
210 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. | 193 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. |
211 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. | 194 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. |
212 | 195 |
213 Since r1 is free at entry, we calculate the Js first. */ | 196 Since r1 is free at entry, we calculate the Js first. */ |
214 | 197 #define Transpose() \ |
215 #define Transpose() { \ | 198 "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ |
216 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \ | 199 "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ |
217 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \ | 200 "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ |
218 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \ | 201 "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ |
219 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \ | 202 "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ |
220 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \ | 203 "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ |
221 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \ | 204 "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ |
222 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \ | 205 "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ |
223 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \ | 206 "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ |
224 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \ | 207 "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ |
225 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \ | 208 "movq %%mm4, "J(4)"\n\t" \ |
226 movq_r2m(r4, *J(4)); \ | 209 "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ |
227 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \ | 210 "movq %%mm5, "J(5)"\n\t" \ |
228 movq_r2m(r5, *J(5)); \ | 211 "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ |
229 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \ | 212 "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ |
230 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \ | 213 "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ |
231 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \ | 214 "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ |
232 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \ | 215 "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ |
233 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \ | 216 "movq %%mm6, "J(7)"\n\t" \ |
234 movq_r2m(r6, *J(7)); \ | 217 "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ |
235 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \ | 218 "movq %%mm1, "J(6)"\n\t" \ |
236 movq_r2m(r1, *J(6)); \ | 219 "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ |
237 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \ | 220 "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ |
238 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \ | 221 "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ |
239 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \ | 222 "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ |
240 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \ | 223 "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ |
241 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \ | 224 "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ |
242 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \ | 225 "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ |
243 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \ | 226 "movq %%mm0, "I(0)"\n\t" \ |
244 movq_r2m(r0, *I(0)); \ | 227 "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ |
245 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \ | 228 "movq %%mm1, "I(1)"\n\t" \ |
246 movq_r2m(r1, *I(1)); \ | 229 "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ |
247 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \ | 230 "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ |
248 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \ | 231 "movq %%mm4, "I(3)"\n\t" \ |
249 movq_r2m(r4, *I(3)); \ | 232 "movq %%mm2, "I(2)"\n\t" |
250 movq_r2m(r2, *I(2)); \ | |
251 } | |
252 | 233 |
253 void ff_vp3_idct_mmx(int16_t *output_data) | 234 void ff_vp3_idct_mmx(int16_t *output_data) |
254 { | 235 { |
255 /* eax = quantized input | 236 /* eax = quantized input |
256 * ebx = dequantizer matrix | 237 * ebx = dequantizer matrix |
259 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 | 240 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 |
260 * edx = output | 241 * edx = output |
261 * r0..r7 = mm0..mm7 | 242 * r0..r7 = mm0..mm7 |
262 */ | 243 */ |
263 | 244 |
264 #define C(x) (ff_vp3_idct_data + (x - 1) * 8) | 245 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)" |
265 #define Eight (&ff_pw_8) | 246 #define OC_8 "%2" |
266 | 247 |
267 /* at this point, function has completed dequantization + dezigzag + | 248 /* at this point, function has completed dequantization + dezigzag + |
268 * partial transposition; now do the idct itself */ | 249 * partial transposition; now do the idct itself */ |
269 #define I(K) (output_data + K * 8) | 250 #define I(x) AV_STRINGIFY(16* x )"(%0)" |
270 #define J(K) (output_data + ((K - 4) * 8) + 4) | 251 #define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" |
271 | 252 |
272 RowIDCT(); | 253 asm volatile ( |
273 Transpose(); | 254 RowIDCT() |
274 | 255 Transpose() |
275 #undef I | 256 |
276 #undef J | 257 #undef I |
277 #define I(K) (output_data + (K * 8) + 32) | 258 #undef J |
278 #define J(K) (output_data + ((K - 4) * 8) + 36) | 259 #define I(x) AV_STRINGIFY(16* x + 64)"(%0)" |
279 | 260 #define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" |
280 RowIDCT(); | 261 |
281 Transpose(); | 262 RowIDCT() |
282 | 263 Transpose() |
283 #undef I | 264 |
284 #undef J | 265 #undef I |
285 #define I(K) (output_data + K * 8) | 266 #undef J |
286 #define J(K) (output_data + K * 8) | 267 #define I(x) AV_STRINGIFY(16*x)"(%0)" |
287 | 268 #define J(x) AV_STRINGIFY(16*x)"(%0)" |
288 ColumnIDCT(); | 269 |
289 | 270 ColumnIDCT() |
290 #undef I | 271 |
291 #undef J | 272 #undef I |
292 #define I(K) (output_data + (K * 8) + 4) | 273 #undef J |
293 #define J(K) (output_data + (K * 8) + 4) | 274 #define I(x) AV_STRINGIFY(16*x + 8)"(%0)" |
294 | 275 #define J(x) AV_STRINGIFY(16*x + 8)"(%0)" |
295 ColumnIDCT(); | 276 |
296 | 277 ColumnIDCT() |
278 :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) | |
279 ); | |
297 #undef I | 280 #undef I |
298 #undef J | 281 #undef J |
299 | 282 |
300 } | 283 } |
301 | 284 |