comparison i386/vp3dsp_mmx.c @ 7877:8759422d660a libavcodec

Rewrite MMX VP3 IDCT in inline asm
author conrad
date Wed, 17 Sep 2008 19:30:33 +0000
parents 892ca48b7d76
children e05e021fce72
comparison
equal deleted inserted replaced
7876:3fd591f125b5 7877:8759422d660a
23 * MMX-optimized functions cribbed from the original VP3 source code. 23 * MMX-optimized functions cribbed from the original VP3 source code.
24 */ 24 */
25 25
26 #include "libavcodec/dsputil.h" 26 #include "libavcodec/dsputil.h"
27 #include "dsputil_mmx.h" 27 #include "dsputil_mmx.h"
28 #include "mmx.h"
29 28
30 extern const uint16_t ff_vp3_idct_data[]; 29 extern const uint16_t ff_vp3_idct_data[];
31 30
32 #define r0 mm0
33 #define r1 mm1
34 #define r2 mm2
35 #define r3 mm3
36 #define r4 mm4
37 #define r5 mm5
38 #define r6 mm6
39 #define r7 mm7
40
41 /* from original comments: The Macro does IDct on 4 1-D Dcts */ 31 /* from original comments: The Macro does IDct on 4 1-D Dcts */
42 #define BeginIDCT() { \ 32 #define BeginIDCT() \
43 movq_m2r(*I(3), r2); \ 33 "movq "I(3)", %%mm2 \n\t" \
44 movq_m2r(*C(3), r6); \ 34 "movq "C(3)", %%mm6 \n\t" \
45 movq_r2r(r2, r4); \ 35 "movq %%mm2, %%mm4 \n\t" \
46 movq_m2r(*J(5), r7); \ 36 "movq "J(5)", %%mm7 \n\t" \
47 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \ 37 "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \
48 movq_m2r(*C(5), r1); \ 38 "movq "C(5)", %%mm1 \n\t" \
49 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \ 39 "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \
50 movq_r2r(r1, r5); \ 40 "movq %%mm1, %%mm5 \n\t" \
51 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \ 41 "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \
52 movq_m2r(*I(1), r3); \ 42 "movq "I(1)", %%mm3 \n\t" \
53 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \ 43 "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \
54 movq_m2r(*C(1), r0); /* (all registers are in use) */ \ 44 "movq "C(1)", %%mm0 \n\t" \
55 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \ 45 "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \
56 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \ 46 "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \
57 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \ 47 "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \
58 movq_m2r(*J(7), r1); \ 48 "movq "J(7)", %%mm1 \n\t" \
59 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \ 49 "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \
60 movq_r2r(r0, r5); /* r5 = c1 */ \ 50 "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \
61 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \ 51 "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \
62 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \ 52 "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \
63 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \ 53 "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \
64 movq_m2r(*C(7), r7); \ 54 "movq "C(7)", %%mm7 \n\t" \
65 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \ 55 "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \
66 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \ 56 "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \
67 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \ 57 "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \
68 movq_m2r(*I(2), r2); \ 58 "movq "I(2)", %%mm2 \n\t" \
69 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \ 59 "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \
70 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \ 60 "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \
71 movq_r2r(r2, r1); /* r1 = i2 */ \ 61 "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \
72 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \ 62 "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \
73 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \ 63 "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \
74 movq_m2r(*J(6), r5); \ 64 "movq "J(6)", %%mm5 \n\t" \
75 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \ 65 "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \
76 movq_r2r(r5, r7); /* r7 = i6 */ \ 66 "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \
77 psubsw_r2r(r4, r0); /* r0 = A - C */ \ 67 "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \
78 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \ 68 "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \
79 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \ 69 "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \
80 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \ 70 "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \
81 paddsw_r2r(r4, r4); /* r4 = C + C */ \ 71 "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \
82 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \ 72 "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \
83 psubsw_r2r(r6, r3); /* r3 = B - D */ \ 73 "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \
84 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \ 74 "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \
85 paddsw_r2r(r6, r6); /* r6 = D + D */ \ 75 "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \
86 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \ 76 "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \
87 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \ 77 "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \
88 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \ 78 "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \
89 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \ 79 "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \
90 movq_m2r(*C(4), r4); \ 80 "movq "C(4)", %%mm4 \n\t" \
91 movq_r2r(r3, r5); /* r5 = B - D */ \ 81 "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \
92 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \ 82 "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
 93       paddsw_r2r(r2, r7);     /* r7 = G = c6*i6 + c2*i2 */ \        83     "paddsw %%mm2, %%mm7 \n\t"    /* r7 = G = c6*i6 + c2*i2 */ \
94 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \ 84 "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \
95 movq_r2r(r0, r2); /* r2 = A - C */ \ 85 "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \
96 movq_m2r(*I(0), r6); \ 86 "movq "I(0)", %%mm6 \n\t" \
97 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \ 87 "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \
98 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \ 88 "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \
99 movq_m2r(*J(4), r3); \ 89 "movq "J(4)", %%mm3 \n\t" \
100 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \ 90 "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \
101 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \ 91 "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \
102 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \ 92 "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \
103 movq_r2r(r6, r0); \ 93 "movq %%mm6, %%mm0 \n\t" \
104 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \ 94 "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \
105 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \ 95 "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \
106 paddsw_r2r(r1, r1); /* r1 = H + H */ \ 96 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \
107 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \ 97 "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \
108 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \ 98 "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \
109 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \ 99 "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \
110 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \ 100 "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \
111 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \ 101 "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \
112 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \ 102 "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \
113 movq_m2r(*I(1), r0); /* r0 = C. */ \ 103 "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \
114 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \ 104 "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \
115 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \ 105 "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \
116 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \ 106 "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */
117 }
118 107
119 /* RowIDCT gets ready to transpose */ 108 /* RowIDCT gets ready to transpose */
120 #define RowIDCT() { \ 109 #define RowIDCT() \
121 \ 110 BeginIDCT() \
122 BeginIDCT(); \ 111 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
123 \ 112 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
124 movq_m2r(*I(2), r3); /* r3 = D. */ \ 113 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
125 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ 114 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
126 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ 115 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
127       paddsw_r2r(r7, r7);      /* r7 = G + G */ \       116     "paddsw %%mm4, %%mm7 \n\t"    /* r7 = G. = E + G */ \
128 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ 117 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
129 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ 118 "paddsw %%mm3, %%mm3 \n\t" \
130 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ 119 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
131 paddsw_r2r(r3, r3); \ 120 "paddsw %%mm5, %%mm5 \n\t" \
132 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ 121 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
133 paddsw_r2r(r5, r5); \ 122 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
134 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ 123 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
135 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ 124 "paddsw %%mm0, %%mm0 \n\t" \
136 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ 125 "movq %%mm1, "I(1)"\n\t" /* save R1 */ \
137 paddsw_r2r(r0, r0); \ 126 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */
138 movq_r2m(r1, *I(1)); /* save R1 */ \
139 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
140 }
141 127
142 /* Column IDCT normalizes and stores final results */ 128 /* Column IDCT normalizes and stores final results */
143 #define ColumnIDCT() { \ 129 #define ColumnIDCT() \
144 \ 130 BeginIDCT() \
145 BeginIDCT(); \ 131 "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \
146 \ 132 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
147 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \ 133 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
148 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \ 134 "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \
149 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \ 135 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
150 psraw_i2r(4, r2); /* r2 = NR2 */ \ 136 "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \
151 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \ 137 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
152 psraw_i2r(4, r1); /* r1 = NR1 */ \ 138 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
153 movq_m2r(*I(2), r3); /* r3 = D. */ \ 139 "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \
154 paddsw_r2r(r7, r7); /* r7 = G + G */ \ 140 "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \
155 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \ 141 "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \
156 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \ 142 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
157 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \ 143 "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \
158 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \ 144 "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \
159 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \ 145 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
160 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \ 146 "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \
161 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \ 147 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
162 psraw_i2r(4, r4); /* r4 = NR4 */ \ 148 "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \
163 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \ 149 "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \
164 psraw_i2r(4, r3); /* r3 = NR3 */ \ 150 "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \
165 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \ 151 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
166 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \ 152 "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \
167 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \ 153 "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \
168 psraw_i2r(4, r6); /* r6 = NR6 */ \ 154 "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \
169 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \ 155 "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \
170 psraw_i2r(4, r5); /* r5 = NR5 */ \ 156 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
171 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \ 157 "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \
172 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \ 158 "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \
173 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \ 159 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \
174 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \ 160 "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \
175 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \ 161 "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \
176 psraw_i2r(4, r7); /* r7 = NR7 */ \ 162 "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \
177 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \ 163 "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \
178 psraw_i2r(4, r0); /* r0 = NR0 */ \ 164 "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \
179 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \ 165 "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */
180 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
181 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
182 }
183 166
184 /* Following macro does two 4x4 transposes in place. 167 /* Following macro does two 4x4 transposes in place.
185 168
186 At entry (we assume): 169 At entry (we assume):
187 170
209 192
210 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. 193 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
211 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 194 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
212 195
213 Since r1 is free at entry, we calculate the Js first. */ 196 Since r1 is free at entry, we calculate the Js first. */
214 197 #define Transpose() \
215 #define Transpose() { \ 198 "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \
216 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \ 199 "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \
217 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \ 200 "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \
218 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \ 201 "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \
219 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \ 202 "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \
220 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \ 203 "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \
221 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \ 204 "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \
222 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \ 205 "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \
223 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \ 206 "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \
224 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \ 207 "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \
225 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \ 208 "movq %%mm4, "J(4)"\n\t" \
226 movq_r2m(r4, *J(4)); \ 209 "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \
227 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \ 210 "movq %%mm5, "J(5)"\n\t" \
228 movq_r2m(r5, *J(5)); \ 211 "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \
229 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \ 212 "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \
230 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \ 213 "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \
231 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \ 214 "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \
232 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \ 215 "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \
233 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \ 216 "movq %%mm6, "J(7)"\n\t" \
234 movq_r2m(r6, *J(7)); \ 217 "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \
235 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \ 218 "movq %%mm1, "J(6)"\n\t" \
236 movq_r2m(r1, *J(6)); \ 219 "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \
237 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \ 220 "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \
238 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \ 221 "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \
239 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \ 222 "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \
240 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \ 223 "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \
241 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \ 224 "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \
242 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \ 225 "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \
243 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \ 226 "movq %%mm0, "I(0)"\n\t" \
244 movq_r2m(r0, *I(0)); \ 227 "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \
245 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \ 228 "movq %%mm1, "I(1)"\n\t" \
246 movq_r2m(r1, *I(1)); \ 229 "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \
247 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \ 230 "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \
248 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \ 231 "movq %%mm4, "I(3)"\n\t" \
249 movq_r2m(r4, *I(3)); \ 232 "movq %%mm2, "I(2)"\n\t"
250 movq_r2m(r2, *I(2)); \
251 }
252 233
253 void ff_vp3_idct_mmx(int16_t *output_data) 234 void ff_vp3_idct_mmx(int16_t *output_data)
254 { 235 {
255 /* eax = quantized input 236 /* eax = quantized input
256 * ebx = dequantizer matrix 237 * ebx = dequantizer matrix
259 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 240 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
260 * edx = output 241 * edx = output
261 * r0..r7 = mm0..mm7 242 * r0..r7 = mm0..mm7
262 */ 243 */
263 244
264 #define C(x) (ff_vp3_idct_data + (x - 1) * 8) 245 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
265 #define Eight (&ff_pw_8) 246 #define OC_8 "%2"
266 247
267 /* at this point, function has completed dequantization + dezigzag + 248 /* at this point, function has completed dequantization + dezigzag +
268 * partial transposition; now do the idct itself */ 249 * partial transposition; now do the idct itself */
269 #define I(K) (output_data + K * 8) 250 #define I(x) AV_STRINGIFY(16* x )"(%0)"
270 #define J(K) (output_data + ((K - 4) * 8) + 4) 251 #define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
271 252
272 RowIDCT(); 253 asm volatile (
273 Transpose(); 254 RowIDCT()
274 255 Transpose()
275 #undef I 256
276 #undef J 257 #undef I
277 #define I(K) (output_data + (K * 8) + 32) 258 #undef J
278 #define J(K) (output_data + ((K - 4) * 8) + 36) 259 #define I(x) AV_STRINGIFY(16* x + 64)"(%0)"
279 260 #define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
280 RowIDCT(); 261
281 Transpose(); 262 RowIDCT()
282 263 Transpose()
283 #undef I 264
284 #undef J 265 #undef I
285 #define I(K) (output_data + K * 8) 266 #undef J
286 #define J(K) (output_data + K * 8) 267 #define I(x) AV_STRINGIFY(16*x)"(%0)"
287 268 #define J(x) AV_STRINGIFY(16*x)"(%0)"
288 ColumnIDCT(); 269
289 270 ColumnIDCT()
290 #undef I 271
291 #undef J 272 #undef I
292 #define I(K) (output_data + (K * 8) + 4) 273 #undef J
293 #define J(K) (output_data + (K * 8) + 4) 274 #define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
294 275 #define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
295 ColumnIDCT(); 276
296 277 ColumnIDCT()
278 :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
279 );
297 #undef I 280 #undef I
298 #undef J 281 #undef J
299 282
300 } 283 }
301 284