/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/dwt.h"
#include "dsputil_mmx.h"

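/* Inverse horizontal 9/7 lifting for one line of IDWTELEM coefficients.
 * The line arrives split in two halves: low-pass samples in b[0..w2),
 * high-pass samples in b[w2..width). Four lifting passes (Lift 0-3) undo
 * the analysis steps, then the halves are interleaved back into natural
 * order. Each pass pairs a vectorized inner loop with scalar lead-in and
 * lead-out code for the edges and any non-multiple-of-16 remainder. */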
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED(16, IDWTELEM, temp)[width>>1];
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0]; // By allowing the first entry, b[0], to be calculated twice
        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
        // The savings in code and time are well worth having to store this value and
        // calculate b[0] correctly afterwards.

        i = 0;
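        // The block below builds constants without touching memory:
        // xmm7 = -1 in every 16-bit lane, and xmm3 = -3<<13 per lane.
        // Since pmulhw keeps the high 16 bits of the product, multiplying
        // by xmm3 in the loop computes (x * -3) >> 3 per lane, i.e. the
        // same multiply-and-shift the scalar W_DM/W_DS lifting step performs.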
        __asm__ volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pcmpeqd %%xmm3, %%xmm3 \n\t"
            "psllw $1, %%xmm3 \n\t"
            "paddw %%xmm7, %%xmm3 \n\t"
            "psllw $13, %%xmm3 \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "paddw %%xmm7, %%xmm2 \n\t"
                "paddw %%xmm7, %%xmm6 \n\t"
                "pmulhw %%xmm3, %%xmm2 \n\t"
                "pmulhw %%xmm3, %%xmm6 \n\t"
                "paddw (%0), %%xmm2 \n\t"
                "paddw 16(%0), %%xmm6 \n\t"
                "movdqa %%xmm2, (%0) \n\t"
                "movdqa %%xmm6, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubw %%xmm2, %%xmm0 \n\t"
                "psubw %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;
        IDWTELEM b_0 = b[0];

        i = 0;
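        // Note: xmm7 is used uninitialized here and relies on still holding
        // the all-ones pattern left by the Lift 0 setup above (a separate
        // asm block, so this is fragile). After psllw $15 each lane is
        // 0x8000, and xmm6 becomes 0x8007: bias values that shift signed
        // words into unsigned range so pavgw (an unsigned rounded average)
        // can stand in for a signed one, with the low bits apparently
        // supplying the lifting rounding offset.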
        __asm__ volatile(
            "psllw $15, %%xmm7 \n\t"
            "pcmpeqw %%xmm6, %%xmm6 \n\t"
            "psrlw $13, %%xmm6 \n\t"
            "paddw %%xmm7, %%xmm6 \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm0 \n\t"
                "movdqu 16(%1), %%xmm4 \n\t"
                "movdqu 2(%1), %%xmm1 \n\t"
                "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "paddw %%xmm7, %%xmm1 \n\t"
                "paddw %%xmm7, %%xmm5 \n\t"
                "pavgw %%xmm1, %%xmm0 \n\t"
                "pavgw %%xmm5, %%xmm4 \n\t"
                "psubw %%xmm7, %%xmm0 \n\t"
                "psubw %%xmm7, %%xmm4 \n\t"
                "psraw $1, %%xmm0 \n\t"
                "psraw $1, %%xmm4 \n\t"
                "movdqa (%0), %%xmm1 \n\t"
                "movdqa 16(%0), %%xmm5 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "psraw $2, %%xmm0 \n\t"
                "psraw $2, %%xmm4 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;

        i = 0;
        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw (%1), %%xmm2 \n\t"
                "paddw 16(%1), %%xmm6 \n\t"
                "movdqu (%0), %%xmm0 \n\t"
                "movdqu 16(%0), %%xmm4 \n\t"
                "paddw %%xmm2, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "psraw $1, %%xmm2 \n\t"
                "psraw $1, %%xmm6 \n\t"
                "paddw %%xmm0, %%xmm2 \n\t"
                "paddw %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

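    // Interleave the two halves back into natural order: b[2*k] takes the
    // low-pass sample b[k] and b[2*k+1] the high-pass sample temp[k].
    // Walking backwards lets the merge run in place without clobbering
    // low-pass samples that have not been consumed yet.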
    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x3E) != 0x3E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=62; i>=0; i-=64){
            __asm__ volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpcklwd (%2), %%xmm0 \n\t"
                "punpcklwd 16(%2), %%xmm2 \n\t"
                "punpcklwd 32(%2), %%xmm4 \n\t"
                "punpcklwd 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhwd (%2), %%xmm1 \n\t"
                "punpckhwd 16(%2), %%xmm3 \n\t"
                "punpckhwd 32(%2), %%xmm5 \n\t"
                "punpckhwd 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                : "memory"
            );
        }
    }
}

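/* MMX version of the horizontal compose: the same four lifting passes and
 * final interleave as the SSE2 code above, but processing 8 coefficients
 * per iteration, computing b[0] up front instead of patching it afterwards,
 * and with no alignment requirement on temp. */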
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
    const int w2= (width+1)>>1;
    IDWTELEM temp[width >> 1];
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;

        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "pcmpeqw %%mm3, %%mm3 \n\t"
            "psllw $1, %%mm3 \n\t"
            "paddw %%mm7, %%mm3 \n\t"
            "psllw $13, %%mm3 \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "paddw %%mm7, %%mm2 \n\t"
                "paddw %%mm7, %%mm6 \n\t"
                "pmulhw %%mm3, %%mm2 \n\t"
                "pmulhw %%mm3, %%mm6 \n\t"
                "paddw (%0), %%mm2 \n\t"
                "paddw 8(%0), %%mm6 \n\t"
                "movq %%mm2, (%0) \n\t"
                "movq %%mm6, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubw %%mm2, %%mm0 \n\t"
                "psubw %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;

        i = 1;
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
        __asm__ volatile(
            "psllw $15, %%mm7 \n\t"
            "pcmpeqw %%mm6, %%mm6 \n\t"
            "psrlw $13, %%mm6 \n\t"
            "paddw %%mm7, %%mm6 \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "movq 2(%1), %%mm1 \n\t"
                "movq 10(%1), %%mm5 \n\t"
                "paddw %%mm6, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "paddw %%mm7, %%mm1 \n\t"
                "paddw %%mm7, %%mm5 \n\t"
                "pavgw %%mm1, %%mm0 \n\t"
                "pavgw %%mm5, %%mm4 \n\t"
                "psubw %%mm7, %%mm0 \n\t"
                "psubw %%mm7, %%mm4 \n\t"
                "psraw $1, %%mm0 \n\t"
                "psraw $1, %%mm4 \n\t"
                "movq (%0), %%mm1 \n\t"
                "movq 8(%0), %%mm5 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "psraw $2, %%mm0 \n\t"
                "psraw $2, %%mm4 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;
        i = 0;

        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq 2(%1), %%mm2 \n\t"
                "movq 10(%1), %%mm6 \n\t"
                "paddw (%1), %%mm2 \n\t"
                "paddw 8(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "psraw $1, %%mm2 \n\t"
                "psraw $1, %%mm6 \n\t"
                "paddw %%mm0, %%mm2 \n\t"
                "paddw %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpcklwd (%2), %%mm0 \n\t"
                "punpcklwd 8(%2), %%mm2 \n\t"
                "punpcklwd 16(%2), %%mm4 \n\t"
                "punpcklwd 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhwd (%2), %%mm1 \n\t"
                "punpckhwd 8(%2), %%mm3 \n\t"
                "punpckhwd 16(%2), %%mm5 \n\t"
                "punpckhwd 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory"
            );
        }
    }
}

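/* The vertical compose loops keep six band pointers plus the loop counter
 * live in general-purpose registers at once, hence the HAVE_7REGS guard. */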
#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
        ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
        ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
        ""op" 48("r",%%"REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubw %%"s0", %%"t0" \n\t"\
        "psubw %%"s1", %%"t1" \n\t"\
        "psubw %%"s2", %%"t2" \n\t"\
        "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
        "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
        "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
        "movdqa %%"s3", 48("w",%%"REG_d") \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddw %%"s0", %%"t0" \n\t"\
        "paddw %%"s1", %%"t1" \n\t"\
        "paddw %%"s2", %%"t2" \n\t"\
        "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
        "pmulhw %%"s0", %%"t0" \n\t"\
        "pmulhw %%"s1", %%"t1" \n\t"\
        "pmulhw %%"s2", %%"t2" \n\t"\
        "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"

static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;

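    // Peel elements off the end with the plain C lifting steps until the
    // remaining count is a multiple of 32, so the asm loop below can always
    // consume full 64-byte strides.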
    while(i & 0x1F)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
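    // i is an element count; doubling turns it into a byte offset
    // (IDWTELEM is 16 bits), which the asm counts down 64 bytes at a time.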
    i+=i;

    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")


        "pcmpeqw %%xmm0, %%xmm0 \n\t"
        "pcmpeqw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "psllw $13, %%xmm2 \n\t"
        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "pcmpeqw %%xmm5, %%xmm5 \n\t"
        "psllw $15, %%xmm7 \n\t"
        "psrlw $13, %%xmm5 \n\t"
        "paddw %%xmm7, %%xmm5 \n\t"
        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
        "movq (%2,%%"REG_d"), %%xmm1 \n\t"
        "movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm0 \n\t"
        "pavgw %%xmm3, %%xmm2 \n\t"
        "movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
        "movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm4 \n\t"
        "pavgw %%xmm3, %%xmm6 \n\t"
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")

        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")

        "2: \n\t"
        "sub $64, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}

#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
        ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
        ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
        ""op" 24("r",%%"REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", ("w",%%"REG_d") \n\t"\
        "movq %%"s1", 8("w",%%"REG_d") \n\t"\
        "movq %%"s2", 16("w",%%"REG_d") \n\t"\
        "movq %%"s3", 24("w",%%"REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"


static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;
    while(i & 15)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;
    __asm__ volatile(
        "jmp 2f \n\t"
        "1: \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw %%mm0, %%mm0 \n\t"
        "pcmpeqw %%mm2, %%mm2 \n\t"
        "paddw %%mm2, %%mm2 \n\t"
        "paddw %%mm0, %%mm2 \n\t"
        "psllw $13, %%mm2 \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "pcmpeqw %%mm5, %%mm5 \n\t"
        "psllw $15, %%mm7 \n\t"
        "psrlw $13, %%mm5 \n\t"
        "paddw %%mm7, %%mm5 \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq (%2,%%"REG_d"), %%mm1 \n\t"
        "movq 8(%2,%%"REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm0 \n\t"
        "pavgw %%mm3, %%mm2 \n\t"
        "movq 16(%2,%%"REG_d"), %%mm1 \n\t"
        "movq 24(%2,%%"REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm4 \n\t"
        "pavgw %%mm3, %%mm6 \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")

        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")

        "2: \n\t"
        "sub $32, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS

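/* inner_add_yblock helpers: OBMC reconstruction. Each output pixel sums up
 * to four motion-compensated block predictions, each weighted by its entry
 * in the overlapping obmc window; the weighted sum is added to the IDWT
 * coefficients read from the slice buffer, rounded down from FRAC_BITS
 * precision, clipped, and written to dst8. The variants differ in block
 * width and in how many rows they produce per loop iteration. */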
#define snow_inner_add_yblock_sse2_header \
        IDWTELEM * * dst_array = sb->line + src_y;\
        x86_reg tmp;\
        __asm__ volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
        "pcmpeqd %%xmm3, %%xmm3 \n\t"\
        "psllw $15, %%xmm3 \n\t"\
        "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
        "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
        "movq (%%"REG_d"), %%"out_reg1" \n\t"\
        "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
        "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
        "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        "pmullw %%xmm0, %%"out_reg1" \n\t"\
        "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
        "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
        "movq (%%"REG_d"), %%"out_reg1" \n\t"\
        "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
        "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
        "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        "pmullw %%xmm0, %%"out_reg1" \n\t"\
        "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
        snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
        "paddusw %%xmm2, %%xmm1 \n\t"\
        "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
        snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
        "paddusw %%xmm2, %%xmm1 \n\t"\
        "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
        "add $32, %%"REG_S" \n\t"\
        "add %%"REG_c", %0 \n\t"\
        "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
        "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
        "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
        "add %%"REG_c", (%%"REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
        "jnz 1b \n\t"\
        :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
        :\
        "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
        "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
        "sal $1, %%"REG_c" \n\t"\
        "add $"PTR_SIZE"*2, %1 \n\t"\
        snow_inner_add_yblock_sse2_end_common1\
        "sar $1, %%"REG_c" \n\t"\
        "sub $2, %2 \n\t"\
        snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
        "add $"PTR_SIZE"*1, %1 \n\t"\
        snow_inner_add_yblock_sse2_end_common1\
        "dec %2 \n\t"\
        snow_inner_add_yblock_sse2_end_common2

static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_8("2", "8")
    snow_inner_add_yblock_sse2_accum_8("1", "128")
    snow_inner_add_yblock_sse2_accum_8("0", "136")

    "mov %0, %%"REG_d" \n\t"
    "movdqa (%%"REG_D"), %%xmm0 \n\t"
    "movdqa %%xmm1, %%xmm2 \n\t"

    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm2 \n\t"
    "paddd %%xmm2, %%xmm0 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
    "paddd %%xmm1, %%xmm2 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm2 \n\t"

    "mov %1, %%"REG_D" \n\t"
    "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
    "add %3, %%"REG_D" \n\t"

    "movdqa (%%"REG_D"), %%xmm4 \n\t"
    "movdqa %%xmm5, %%xmm6 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "punpcklwd %%xmm7, %%xmm6 \n\t"
    "paddd %%xmm6, %%xmm4 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
    "paddd %%xmm5, %%xmm6 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm6 \n\t"

    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm2, %%xmm0 \n\t"
    "packuswb %%xmm7, %%xmm0 \n\t"
    "movq %%xmm0, (%%"REG_d") \n\t"

    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm6, %%xmm4 \n\t"
    "packuswb %%xmm7, %%xmm4 \n\t"
    "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
    snow_inner_add_yblock_sse2_end_8
}

static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_16("2", "16")
    snow_inner_add_yblock_sse2_accum_16("1", "512")
    snow_inner_add_yblock_sse2_accum_16("0", "528")

    "mov %0, %%"REG_d" \n\t"
    "psrlw $4, %%xmm1 \n\t"
    "psrlw $4, %%xmm5 \n\t"
    "paddw (%%"REG_D"), %%xmm1 \n\t"
    "paddw 16(%%"REG_D"), %%xmm5 \n\t"
    "paddw %%xmm3, %%xmm1 \n\t"
    "paddw %%xmm3, %%xmm5 \n\t"
    "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
    "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
    "packuswb %%xmm5, %%xmm1 \n\t"

    "movdqu %%xmm1, (%%"REG_d") \n\t"

    snow_inner_add_yblock_sse2_end_16
}

#define snow_inner_add_yblock_mmx_header \
        IDWTELEM * * dst_array = sb->line + src_y;\
        x86_reg tmp;\
        __asm__ volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%mm7, %%mm7 \n\t" /* 0 */\
        "pcmpeqd %%mm3, %%mm3 \n\t"\
        "psllw $15, %%mm3 \n\t"\
        "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
        "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
        "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
        "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
        "punpcklbw %%mm7, %%"out_reg1" \n\t"\
        "punpcklbw %%mm7, %%"out_reg2" \n\t"\
        "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
        "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "pmullw %%mm0, %%"out_reg1" \n\t"\
        "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
        snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
        "paddusw %%mm2, %%mm1 \n\t"\
        "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
        "mov %0, %%"REG_d" \n\t"\
        "psrlw $4, %%mm1 \n\t"\
        "psrlw $4, %%mm5 \n\t"\
        "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
        "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psraw $4, %%mm1 \n\t"\
        "psraw $4, %%mm5 \n\t"\
        "packuswb %%mm5, %%mm1 \n\t"\
        "movq %%mm1, "write_offset"(%%"REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
        "add $"s_step", %%"REG_S" \n\t"\
        "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
        "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
        "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
        "add %%"REG_c", (%%"REG_a") \n\t"\
        "add $"PTR_SIZE"*1, %1 \n\t"\
        "add %%"REG_c", %0 \n\t"\
        "dec %2 \n\t"\
        "jnz 1b \n\t"\
        :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
        :\
        "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
        "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "8", "0")
    snow_inner_add_yblock_mmx_accum("1", "128", "0")
    snow_inner_add_yblock_mmx_accum("0", "136", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "16", "0")
    snow_inner_add_yblock_mmx_accum("1", "512", "0")
    snow_inner_add_yblock_mmx_accum("0", "528", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")

    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
    snow_inner_add_yblock_mmx_accum("2", "24", "8")
    snow_inner_add_yblock_mmx_accum("1", "520", "8")
    snow_inner_add_yblock_mmx_accum("0", "536", "8")
    snow_inner_add_yblock_mmx_mix("16", "8")
    snow_inner_add_yblock_mmx_end("32")
}

static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                      int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){

    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                      int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

void ff_dwt_init_x86(DWTContext *c)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
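        /* "& 0" makes the condition below always false: the SSE2 versions
         * are kept compiling but deliberately disabled here, presumably due
         * to known problems, so the MMX paths are what actually runs. */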
        if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
        }
        else{
            if(mm_flags & AV_CPU_FLAG_MMX2){
                c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
                c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
            }
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
    }
}