comparison i386/dsputil_h264_template_mmx.c @ 3218:b2ffd9fb4153 libavcodec

simplified and slightly faster h264_chroma_mc8_mmx
author lorenm
date Sat, 25 Mar 2006 08:41:14 +0000
parents 91f89a395b28
children c8c591fe26f8
comparing 3217:d9eceb8313c2 with 3218:b2ffd9fb4153
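For orientation: h264_chroma_mc8 performs H.264 chroma motion compensation, a bilinear blend of the four neighbouring chroma samples with 1/8-pel weights derived from the fractional offsets x and y (0..7 each). A plain-C sketch of the put variant (hypothetical name, not code from this file):

    #include <stdint.h>

    /* A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy; the weights sum
     * to 64, so (sum + 32) >> 6 rounds and never exceeds 255. */
    static void chroma_mc8_ref(uint8_t *dst, uint8_t *src, int stride,
                               int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =      x  * (8 - y);
        const int C = (8 - x) *      y;
        const int D =      x  *      y;
        int i, j;

        for (i = 0; i < h; i++) {
            for (j = 0; j < 8; j++)
                dst[j] = (A * src[j]          + B * src[j + 1] +
                          C * src[j + stride] + D * src[j + stride + 1]
                          + 32) >> 6;
            src += stride;
            dst += stride;
        }
    }

The MMX template vectorizes this eight samples at a time; the H264_CHROMA_OP macro hook lets the same body serve both the put and avg variants of the final store.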
@@ -35,116 +35,60 @@
         return;
     }
 
     assert(x<8 && y<8 && x>=0 && y>=0);
 
-    if(y==0)
+    if(y==0 || x==0)
     {
-        /* horizontal filter only */
-        asm volatile("movd %0, %%mm5\n\t"
-                     "punpcklwd %%mm5, %%mm5\n\t"
-                     "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
-                     "movq %1, %%mm4\n\t"
-                     "pxor %%mm7, %%mm7\n\t"
-                     "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
-                     : : "rm" (x), "m" (ff_pw_8));
+        /* 1 dimensional filter only */
+        const int dxy = x ? 1 : stride;
+
+        asm volatile(
+            "movd %0, %%mm5\n\t"
+            "movq %1, %%mm4\n\t"
+            "punpcklwd %%mm5, %%mm5\n\t"
+            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
+            "movq %%mm4, %%mm6\n\t"
+            "pxor %%mm7, %%mm7\n\t"
+            "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
+            "psrlw $1, %%mm6\n\t" /* mm6 = 4 */
+            :: "rm"(x+y), "m"(ff_pw_8));
 
         for(i=0; i<h; i++) {
             asm volatile(
                 /* mm0 = src[0..7], mm1 = src[1..8] */
                 "movq %0, %%mm0\n\t"
-                "movq %1, %%mm1\n\t"
-                : : "m" (src[0]), "m" (src[1]));
+                "movq %1, %%mm2\n\t"
+                :: "m"(src[0]), "m"(src[dxy]));
 
             asm volatile(
-                /* [mm2,mm3] = A * src[0..7] */
-                "movq %%mm0, %%mm2\n\t"
+                /* [mm0,mm1] = A * src[0..7] */
+                /* [mm2,mm3] = B * src[1..8] */
+                "movq %%mm0, %%mm1\n\t"
+                "movq %%mm2, %%mm3\n\t"
+                "punpcklbw %%mm7, %%mm0\n\t"
+                "punpckhbw %%mm7, %%mm1\n\t"
                 "punpcklbw %%mm7, %%mm2\n\t"
-                "pmullw %%mm4, %%mm2\n\t"
-                "movq %%mm0, %%mm3\n\t"
                 "punpckhbw %%mm7, %%mm3\n\t"
-                "pmullw %%mm4, %%mm3\n\t"
-
-                /* [mm2,mm3] += B * src[1..8] */
-                "movq %%mm1, %%mm0\n\t"
-                "punpcklbw %%mm7, %%mm0\n\t"
-                "pmullw %%mm5, %%mm0\n\t"
-                "punpckhbw %%mm7, %%mm1\n\t"
-                "pmullw %%mm5, %%mm1\n\t"
-                "paddw %%mm0, %%mm2\n\t"
-                "paddw %%mm1, %%mm3\n\t"
-
-                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
-                "paddw %1, %%mm2\n\t"
-                "paddw %1, %%mm3\n\t"
-                "psrlw $3, %%mm2\n\t"
-                "psrlw $3, %%mm3\n\t"
-                "packuswb %%mm3, %%mm2\n\t"
-                H264_CHROMA_OP(%0, %%mm2)
-                "movq %%mm2, %0\n\t"
-                : "=m" (dst[0]) : "m" (ff_pw_4));
+                "pmullw %%mm4, %%mm0\n\t"
+                "pmullw %%mm4, %%mm1\n\t"
+                "pmullw %%mm5, %%mm2\n\t"
+                "pmullw %%mm5, %%mm3\n\t"
+
+                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+                "paddw %%mm6, %%mm0\n\t"
+                "paddw %%mm6, %%mm1\n\t"
+                "paddw %%mm2, %%mm0\n\t"
+                "paddw %%mm3, %%mm1\n\t"
+                "psrlw $3, %%mm0\n\t"
+                "psrlw $3, %%mm1\n\t"
+                "packuswb %%mm1, %%mm0\n\t"
+                H264_CHROMA_OP(%0, %%mm0)
+                "movq %%mm0, %0\n\t"
+                : "=m" (dst[0]));
 
             src += stride;
-            dst += stride;
-        }
-        return;
-    }
-
-    if(x==0)
-    {
-        /* vertical filter only */
-        asm volatile("movd %0, %%mm6\n\t"
-                     "punpcklwd %%mm6, %%mm6\n\t"
-                     "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */
-                     "movq %1, %%mm4\n\t"
-                     "pxor %%mm7, %%mm7\n\t"
-                     "psubw %%mm6, %%mm4\n\t" /* mm4 = A = 8-y */
-                     : : "rm" (y), "m" (ff_pw_8));
-
-        asm volatile(
-            /* mm0 = src[0..7] */
-            "movq %0, %%mm0\n\t"
-            : : "m" (src[0]));
-
-        for(i=0; i<h; i++) {
-            asm volatile(
-                /* [mm2,mm3] = A * src[0..7] */
-                "movq %mm0, %mm2\n\t"
-                "punpcklbw %mm7, %mm2\n\t"
-                "pmullw %mm4, %mm2\n\t"
-                "movq %mm0, %mm3\n\t"
-                "punpckhbw %mm7, %mm3\n\t"
-                "pmullw %mm4, %mm3\n\t");
-
-            src += stride;
-            asm volatile(
-                /* mm0 = src[0..7] */
-                "movq %0, %%mm0\n\t"
-                : : "m" (src[0]));
-
-            asm volatile(
-                /* [mm2,mm3] += C * src[0..7] */
-                "movq %mm0, %mm1\n\t"
-                "punpcklbw %mm7, %mm1\n\t"
-                "pmullw %mm6, %mm1\n\t"
-                "paddw %mm1, %mm2\n\t"
-                "movq %mm0, %mm5\n\t"
-                "punpckhbw %mm7, %mm5\n\t"
-                "pmullw %mm6, %mm5\n\t"
-                "paddw %mm5, %mm3\n\t");
-
-            asm volatile(
-                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
-                "paddw %1, %%mm2\n\t"
-                "paddw %1, %%mm3\n\t"
-                "psrlw $3, %%mm2\n\t"
-                "psrlw $3, %%mm3\n\t"
-                "packuswb %%mm3, %%mm2\n\t"
-                H264_CHROMA_OP(%0, %%mm2)
-                "movq %%mm2, %0\n\t"
-                : "=m" (dst[0]) : "m" (ff_pw_4));
-
             dst += stride;
         }
         return;
     }
 
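The hunk above merges the old horizontal-only and vertical-only special cases into a single 1-D path: when x==0 or y==0, two of the four bilinear taps vanish, and the surviving second tap sits either one byte away (horizontal) or one stride away (vertical). A scalar sketch of the merged loop, using the same A/B naming as the asm comments:

    /* One of x,y is 0, so B = x+y is the second tap's weight and
     * A = 8-B; dxy selects the tap direction.  The asm derives the
     * +4 rounding constant by shifting ff_pw_8 right by one (mm6). */
    const int dxy = x ? 1 : stride;
    const int B = x + y;
    const int A = 8 - B;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j] + B * src[j + dxy] + 4) >> 3;
        src += stride;
        dst += stride;
    }

This is why the new setup block loads "rm"(x+y) instead of "rm"(x), and why the store block no longer needs the ff_pw_4 memory operand.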
175 "movq %0, %%mm0\n\t" 119 "movq %0, %%mm0\n\t"
176 "movq %1, %%mm1\n\t" 120 "movq %1, %%mm1\n\t"
177 : : "m" (src[0]), "m" (src[1])); 121 : : "m" (src[0]), "m" (src[1]));
178 122
179 for(i=0; i<h; i++) { 123 for(i=0; i<h; i++) {
180 asm volatile( 124 src += stride;
181 /* [mm2,mm3] = A * src[0..7] */ 125
126 asm volatile(
127 /* mm2 = A * src[0..3] + B * src[1..4] */
128 /* mm3 = A * src[4..7] + B * src[5..8] */
182 "movq %%mm0, %%mm2\n\t" 129 "movq %%mm0, %%mm2\n\t"
130 "movq %%mm1, %%mm3\n\t"
131 "punpckhbw %%mm7, %%mm0\n\t"
132 "punpcklbw %%mm7, %%mm1\n\t"
183 "punpcklbw %%mm7, %%mm2\n\t" 133 "punpcklbw %%mm7, %%mm2\n\t"
134 "punpckhbw %%mm7, %%mm3\n\t"
135 "pmullw %0, %%mm0\n\t"
184 "pmullw %0, %%mm2\n\t" 136 "pmullw %0, %%mm2\n\t"
185 "movq %%mm0, %%mm3\n\t" 137 "pmullw %%mm5, %%mm1\n\t"
186 "punpckhbw %%mm7, %%mm3\n\t" 138 "pmullw %%mm5, %%mm3\n\t"
187 "pmullw %0, %%mm3\n\t" 139 "paddw %%mm1, %%mm2\n\t"
188 140 "paddw %%mm0, %%mm3\n\t"
189 /* [mm2,mm3] += B * src[1..8] */ 141 : : "m" (AA));
190 "movq %%mm1, %%mm0\n\t" 142
143 asm volatile(
144 /* [mm2,mm3] += C * src[0..7] */
145 "movq %0, %%mm0\n\t"
146 "movq %%mm0, %%mm1\n\t"
191 "punpcklbw %%mm7, %%mm0\n\t" 147 "punpcklbw %%mm7, %%mm0\n\t"
192 "pmullw %%mm5, %%mm0\n\t"
193 "punpckhbw %%mm7, %%mm1\n\t" 148 "punpckhbw %%mm7, %%mm1\n\t"
194 "pmullw %%mm5, %%mm1\n\t" 149 "pmullw %%mm6, %%mm0\n\t"
150 "pmullw %%mm6, %%mm1\n\t"
195 "paddw %%mm0, %%mm2\n\t" 151 "paddw %%mm0, %%mm2\n\t"
196 "paddw %%mm1, %%mm3\n\t" 152 "paddw %%mm1, %%mm3\n\t"
197 : : "m" (AA)); 153 : : "m" (src[0]));
198 154
199 src += stride; 155 asm volatile(
200 asm volatile( 156 /* [mm2,mm3] += D * src[1..8] */
201 /* mm0 = src[0..7], mm1 = src[1..8] */ 157 "movq %1, %%mm1\n\t"
158 "movq %%mm1, %%mm0\n\t"
159 "movq %%mm1, %%mm4\n\t"
160 "punpcklbw %%mm7, %%mm0\n\t"
161 "punpckhbw %%mm7, %%mm4\n\t"
162 "pmullw %2, %%mm0\n\t"
163 "pmullw %2, %%mm4\n\t"
164 "paddw %%mm0, %%mm2\n\t"
165 "paddw %%mm4, %%mm3\n\t"
202 "movq %0, %%mm0\n\t" 166 "movq %0, %%mm0\n\t"
203 "movq %1, %%mm1\n\t" 167 : : "m" (src[0]), "m" (src[1]), "m" (DD));
204 : : "m" (src[0]), "m" (src[1])); 168
205 169 asm volatile(
206 asm volatile( 170 /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
207 /* [mm2,mm3] += C * src[0..7] */
208 "movq %mm0, %mm4\n\t"
209 "punpcklbw %mm7, %mm4\n\t"
210 "pmullw %mm6, %mm4\n\t"
211 "paddw %mm4, %mm2\n\t"
212 "movq %mm0, %mm4\n\t"
213 "punpckhbw %mm7, %mm4\n\t"
214 "pmullw %mm6, %mm4\n\t"
215 "paddw %mm4, %mm3\n\t");
216
217 asm volatile(
218 /* [mm2,mm3] += D * src[1..8] */
219 "movq %%mm1, %%mm4\n\t"
220 "punpcklbw %%mm7, %%mm4\n\t"
221 "pmullw %0, %%mm4\n\t"
222 "paddw %%mm4, %%mm2\n\t"
223 "movq %%mm1, %%mm4\n\t"
224 "punpckhbw %%mm7, %%mm4\n\t"
225 "pmullw %0, %%mm4\n\t"
226 "paddw %%mm4, %%mm3\n\t"
227 : : "m" (DD));
228
229 asm volatile(
230 /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
231 "paddw %1, %%mm2\n\t" 171 "paddw %1, %%mm2\n\t"
232 "paddw %1, %%mm3\n\t" 172 "paddw %1, %%mm3\n\t"
233 "psrlw $6, %%mm2\n\t" 173 "psrlw $6, %%mm2\n\t"
234 "psrlw $6, %%mm3\n\t" 174 "psrlw $6, %%mm3\n\t"
235 "packuswb %%mm3, %%mm2\n\t" 175 "packuswb %%mm3, %%mm2\n\t"