Mercurial > libavcodec.hg
comparison i386/dsputil_h264_template_mmx.c @ 3218:b2ffd9fb4153 libavcodec
simplified and slightly faster h264_chroma_mc8_mmx
field | value |
---|---|
author | lorenm |
date | Sat, 25 Mar 2006 08:41:14 +0000 |
parents | 91f89a395b28 |
children | c8c591fe26f8 |
comparison legend: equal | deleted | inserted | replaced
3217:d9eceb8313c2 | 3218:b2ffd9fb4153 |
---|---|
35 return; | 35 return; |
36 } | 36 } |
37 | 37 |
38 assert(x<8 && y<8 && x>=0 && y>=0); | 38 assert(x<8 && y<8 && x>=0 && y>=0); |
39 | 39 |
40 if(y==0) | 40 if(y==0 || x==0) |
41 { | 41 { |
42 /* horizontal filter only */ | 42 /* 1 dimensional filter only */ |
43 asm volatile("movd %0, %%mm5\n\t" | 43 const int dxy = x ? 1 : stride; |
44 "punpcklwd %%mm5, %%mm5\n\t" | 44 |
45 "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ | 45 asm volatile( |
46 "movq %1, %%mm4\n\t" | 46 "movd %0, %%mm5\n\t" |
47 "pxor %%mm7, %%mm7\n\t" | 47 "movq %1, %%mm4\n\t" |
48 "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ | 48 "punpcklwd %%mm5, %%mm5\n\t" |
49 : : "rm" (x), "m" (ff_pw_8)); | 49 "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ |
50 "movq %%mm4, %%mm6\n\t" | |
51 "pxor %%mm7, %%mm7\n\t" | |
52 "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ | |
53 "psrlw $1, %%mm6\n\t" /* mm6 = 4 */ | |
54 :: "rm"(x+y), "m"(ff_pw_8)); | |
50 | 55 |
51 for(i=0; i<h; i++) { | 56 for(i=0; i<h; i++) { |
52 asm volatile( | 57 asm volatile( |
53 /* mm0 = src[0..7], mm1 = src[1..8] */ | 58 /* mm0 = src[0..7], mm1 = src[1..8] */ |
54 "movq %0, %%mm0\n\t" | 59 "movq %0, %%mm0\n\t" |
55 "movq %1, %%mm1\n\t" | 60 "movq %1, %%mm2\n\t" |
56 : : "m" (src[0]), "m" (src[1])); | 61 :: "m"(src[0]), "m"(src[dxy])); |
57 | 62 |
58 asm volatile( | 63 asm volatile( |
59 /* [mm2,mm3] = A * src[0..7] */ | 64 /* [mm0,mm1] = A * src[0..7] */ |
60 "movq %%mm0, %%mm2\n\t" | 65 /* [mm2,mm3] = B * src[1..8] */ |
66 "movq %%mm0, %%mm1\n\t" | |
67 "movq %%mm2, %%mm3\n\t" | |
68 "punpcklbw %%mm7, %%mm0\n\t" | |
69 "punpckhbw %%mm7, %%mm1\n\t" | |
61 "punpcklbw %%mm7, %%mm2\n\t" | 70 "punpcklbw %%mm7, %%mm2\n\t" |
62 "pmullw %%mm4, %%mm2\n\t" | |
63 "movq %%mm0, %%mm3\n\t" | |
64 "punpckhbw %%mm7, %%mm3\n\t" | 71 "punpckhbw %%mm7, %%mm3\n\t" |
65 "pmullw %%mm4, %%mm3\n\t" | 72 "pmullw %%mm4, %%mm0\n\t" |
66 | 73 "pmullw %%mm4, %%mm1\n\t" |
67 /* [mm2,mm3] += B * src[1..8] */ | 74 "pmullw %%mm5, %%mm2\n\t" |
68 "movq %%mm1, %%mm0\n\t" | 75 "pmullw %%mm5, %%mm3\n\t" |
69 "punpcklbw %%mm7, %%mm0\n\t" | 76 |
70 "pmullw %%mm5, %%mm0\n\t" | 77 /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */ |
71 "punpckhbw %%mm7, %%mm1\n\t" | 78 "paddw %%mm6, %%mm0\n\t" |
72 "pmullw %%mm5, %%mm1\n\t" | 79 "paddw %%mm6, %%mm1\n\t" |
73 "paddw %%mm0, %%mm2\n\t" | 80 "paddw %%mm2, %%mm0\n\t" |
74 "paddw %%mm1, %%mm3\n\t" | 81 "paddw %%mm3, %%mm1\n\t" |
75 | 82 "psrlw $3, %%mm0\n\t" |
76 /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */ | 83 "psrlw $3, %%mm1\n\t" |
77 "paddw %1, %%mm2\n\t" | 84 "packuswb %%mm1, %%mm0\n\t" |
78 "paddw %1, %%mm3\n\t" | 85 H264_CHROMA_OP(%0, %%mm0) |
79 "psrlw $3, %%mm2\n\t" | 86 "movq %%mm0, %0\n\t" |
80 "psrlw $3, %%mm3\n\t" | 87 : "=m" (dst[0])); |
81 "packuswb %%mm3, %%mm2\n\t" | |
82 H264_CHROMA_OP(%0, %%mm2) | |
83 "movq %%mm2, %0\n\t" | |
84 : "=m" (dst[0]) : "m" (ff_pw_4)); | |
85 | 88 |
86 src += stride; | 89 src += stride; |
87 dst += stride; | |
88 } | |
89 return; | |
90 } | |
91 | |
92 if(x==0) | |
93 { | |
94 /* vertical filter only */ | |
95 asm volatile("movd %0, %%mm6\n\t" | |
96 "punpcklwd %%mm6, %%mm6\n\t" | |
97 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */ | |
98 "movq %1, %%mm4\n\t" | |
99 "pxor %%mm7, %%mm7\n\t" | |
100 "psubw %%mm6, %%mm4\n\t" /* mm4 = A = 8-y */ | |
101 : : "rm" (y), "m" (ff_pw_8)); | |
102 | |
103 asm volatile( | |
104 /* mm0 = src[0..7] */ | |
105 "movq %0, %%mm0\n\t" | |
106 : : "m" (src[0])); | |
107 | |
108 for(i=0; i<h; i++) { | |
109 asm volatile( | |
110 /* [mm2,mm3] = A * src[0..7] */ | |
111 "movq %mm0, %mm2\n\t" | |
112 "punpcklbw %mm7, %mm2\n\t" | |
113 "pmullw %mm4, %mm2\n\t" | |
114 "movq %mm0, %mm3\n\t" | |
115 "punpckhbw %mm7, %mm3\n\t" | |
116 "pmullw %mm4, %mm3\n\t"); | |
117 | |
118 src += stride; | |
119 asm volatile( | |
120 /* mm0 = src[0..7] */ | |
121 "movq %0, %%mm0\n\t" | |
122 : : "m" (src[0])); | |
123 | |
124 asm volatile( | |
125 /* [mm2,mm3] += C * src[0..7] */ | |
126 "movq %mm0, %mm1\n\t" | |
127 "punpcklbw %mm7, %mm1\n\t" | |
128 "pmullw %mm6, %mm1\n\t" | |
129 "paddw %mm1, %mm2\n\t" | |
130 "movq %mm0, %mm5\n\t" | |
131 "punpckhbw %mm7, %mm5\n\t" | |
132 "pmullw %mm6, %mm5\n\t" | |
133 "paddw %mm5, %mm3\n\t"); | |
134 | |
135 asm volatile( | |
136 /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */ | |
137 "paddw %1, %%mm2\n\t" | |
138 "paddw %1, %%mm3\n\t" | |
139 "psrlw $3, %%mm2\n\t" | |
140 "psrlw $3, %%mm3\n\t" | |
141 "packuswb %%mm3, %%mm2\n\t" | |
142 H264_CHROMA_OP(%0, %%mm2) | |
143 "movq %%mm2, %0\n\t" | |
144 : "=m" (dst[0]) : "m" (ff_pw_4)); | |
145 | |
146 dst += stride; | 90 dst += stride; |
147 } | 91 } |
148 return; | 92 return; |
149 } | 93 } |
150 | 94 |
175 "movq %0, %%mm0\n\t" | 119 "movq %0, %%mm0\n\t" |
176 "movq %1, %%mm1\n\t" | 120 "movq %1, %%mm1\n\t" |
177 : : "m" (src[0]), "m" (src[1])); | 121 : : "m" (src[0]), "m" (src[1])); |
178 | 122 |
179 for(i=0; i<h; i++) { | 123 for(i=0; i<h; i++) { |
180 asm volatile( | 124 src += stride; |
181 /* [mm2,mm3] = A * src[0..7] */ | 125 |
126 asm volatile( | |
127 /* mm2 = A * src[0..3] + B * src[1..4] */ | |
128 /* mm3 = A * src[4..7] + B * src[5..8] */ | |
182 "movq %%mm0, %%mm2\n\t" | 129 "movq %%mm0, %%mm2\n\t" |
130 "movq %%mm1, %%mm3\n\t" | |
131 "punpckhbw %%mm7, %%mm0\n\t" | |
132 "punpcklbw %%mm7, %%mm1\n\t" | |
183 "punpcklbw %%mm7, %%mm2\n\t" | 133 "punpcklbw %%mm7, %%mm2\n\t" |
134 "punpckhbw %%mm7, %%mm3\n\t" | |
135 "pmullw %0, %%mm0\n\t" | |
184 "pmullw %0, %%mm2\n\t" | 136 "pmullw %0, %%mm2\n\t" |
185 "movq %%mm0, %%mm3\n\t" | 137 "pmullw %%mm5, %%mm1\n\t" |
186 "punpckhbw %%mm7, %%mm3\n\t" | 138 "pmullw %%mm5, %%mm3\n\t" |
187 "pmullw %0, %%mm3\n\t" | 139 "paddw %%mm1, %%mm2\n\t" |
188 | 140 "paddw %%mm0, %%mm3\n\t" |
189 /* [mm2,mm3] += B * src[1..8] */ | 141 : : "m" (AA)); |
190 "movq %%mm1, %%mm0\n\t" | 142 |
143 asm volatile( | |
144 /* [mm2,mm3] += C * src[0..7] */ | |
145 "movq %0, %%mm0\n\t" | |
146 "movq %%mm0, %%mm1\n\t" | |
191 "punpcklbw %%mm7, %%mm0\n\t" | 147 "punpcklbw %%mm7, %%mm0\n\t" |
192 "pmullw %%mm5, %%mm0\n\t" | |
193 "punpckhbw %%mm7, %%mm1\n\t" | 148 "punpckhbw %%mm7, %%mm1\n\t" |
194 "pmullw %%mm5, %%mm1\n\t" | 149 "pmullw %%mm6, %%mm0\n\t" |
150 "pmullw %%mm6, %%mm1\n\t" | |
195 "paddw %%mm0, %%mm2\n\t" | 151 "paddw %%mm0, %%mm2\n\t" |
196 "paddw %%mm1, %%mm3\n\t" | 152 "paddw %%mm1, %%mm3\n\t" |
197 : : "m" (AA)); | 153 : : "m" (src[0])); |
198 | 154 |
199 src += stride; | 155 asm volatile( |
200 asm volatile( | 156 /* [mm2,mm3] += D * src[1..8] */ |
201 /* mm0 = src[0..7], mm1 = src[1..8] */ | 157 "movq %1, %%mm1\n\t" |
158 "movq %%mm1, %%mm0\n\t" | |
159 "movq %%mm1, %%mm4\n\t" | |
160 "punpcklbw %%mm7, %%mm0\n\t" | |
161 "punpckhbw %%mm7, %%mm4\n\t" | |
162 "pmullw %2, %%mm0\n\t" | |
163 "pmullw %2, %%mm4\n\t" | |
164 "paddw %%mm0, %%mm2\n\t" | |
165 "paddw %%mm4, %%mm3\n\t" | |
202 "movq %0, %%mm0\n\t" | 166 "movq %0, %%mm0\n\t" |
203 "movq %1, %%mm1\n\t" | 167 : : "m" (src[0]), "m" (src[1]), "m" (DD)); |
204 : : "m" (src[0]), "m" (src[1])); | 168 |
205 | 169 asm volatile( |
206 asm volatile( | 170 /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */ |
207 /* [mm2,mm3] += C * src[0..7] */ | |
208 "movq %mm0, %mm4\n\t" | |
209 "punpcklbw %mm7, %mm4\n\t" | |
210 "pmullw %mm6, %mm4\n\t" | |
211 "paddw %mm4, %mm2\n\t" | |
212 "movq %mm0, %mm4\n\t" | |
213 "punpckhbw %mm7, %mm4\n\t" | |
214 "pmullw %mm6, %mm4\n\t" | |
215 "paddw %mm4, %mm3\n\t"); | |
216 | |
217 asm volatile( | |
218 /* [mm2,mm3] += D * src[1..8] */ | |
219 "movq %%mm1, %%mm4\n\t" | |
220 "punpcklbw %%mm7, %%mm4\n\t" | |
221 "pmullw %0, %%mm4\n\t" | |
222 "paddw %%mm4, %%mm2\n\t" | |
223 "movq %%mm1, %%mm4\n\t" | |
224 "punpckhbw %%mm7, %%mm4\n\t" | |
225 "pmullw %0, %%mm4\n\t" | |
226 "paddw %%mm4, %%mm3\n\t" | |
227 : : "m" (DD)); | |
228 | |
229 asm volatile( | |
230 /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */ | |
231 "paddw %1, %%mm2\n\t" | 171 "paddw %1, %%mm2\n\t" |
232 "paddw %1, %%mm3\n\t" | 172 "paddw %1, %%mm3\n\t" |
233 "psrlw $6, %%mm2\n\t" | 173 "psrlw $6, %%mm2\n\t" |
234 "psrlw $6, %%mm3\n\t" | 174 "psrlw $6, %%mm3\n\t" |
235 "packuswb %%mm3, %%mm2\n\t" | 175 "packuswb %%mm3, %%mm2\n\t" |