Mercurial > libavcodec.hg
annotate i386/snowdsp_mmx.c @ 5591:642588a60570 libavcodec
update mmx code to latest snow changes
note, the code likely can overflow and thus needs some more changes
sse2 updated too but disabled as it is untested
author | michael |
---|---|
date | Sat, 25 Aug 2007 15:20:56 +0000 |
parents | 93082c591c8b |
children | fadffa1e5aef |
rev | line source |
---|---|
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
1 /* |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
2 * MMX and SSE2 optimized snow DSP utils |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
3 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
4 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3566
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3566
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3566
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
8 * modify it under the terms of the GNU Lesser General Public |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
9 * License as published by the Free Software Foundation; either |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3566
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3566
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
15 * Lesser General Public License for more details. |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
16 * |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
17 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3566
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
20 */ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
21 |
5010
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
4436
diff
changeset
|
22 #include "avcodec.h" |
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
4436
diff
changeset
|
23 #include "snow.h" |
3398
e0927bc44a10
Move REG_* macros from libavcodec/i386/mmx.h to libavutil/x86_cpu.h
lucabe
parents:
3265
diff
changeset
|
24 #include "x86_cpu.h" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
25 |
5591 | 26 void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){ |
3210 | 27 const int w2= (width+1)>>1; |
28 // SSE2 code runs faster with pointers aligned on a 32-byte boundary. | |
5591 | 29 IDWTELEM temp_buf[(width>>1) + 4]; |
30 IDWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) >> 2); | |
3210 | 31 const int w_l= (width>>1); |
32 const int w_r= w2 - 1; | |
33 int i; | |
34 | |
35 { // Lift 0 | |
5591 | 36 IDWTELEM * const ref = b + w2 - 1; |
37 IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice | |
3210 | 38 // (the first time erroneously), we allow the SSE2 code to run an extra pass. |
39 // The savings in code and time are well worth having to store this value and | |
40 // calculate b[0] correctly afterwards. | |
41 | |
42 i = 0; | |
43 asm volatile( | |
44 "pcmpeqd %%xmm7, %%xmm7 \n\t" | |
5591 | 45 "psllw $15, %%xmm7 \n\t" |
46 "psrlw $13, %%xmm7 \n\t" | |
3210 | 47 ::); |
5591 | 48 for(; i<w_l-15; i+=16){ |
3210 | 49 asm volatile( |
50 "movdqu (%1), %%xmm1 \n\t" | |
51 "movdqu 16(%1), %%xmm5 \n\t" | |
5591 | 52 "movdqu 2(%1), %%xmm2 \n\t" |
53 "movdqu 18(%1), %%xmm6 \n\t" | |
54 "paddw %%xmm1, %%xmm2 \n\t" | |
55 "paddw %%xmm5, %%xmm6 \n\t" | |
3210 | 56 "movdqa %%xmm2, %%xmm0 \n\t" |
57 "movdqa %%xmm6, %%xmm4 \n\t" | |
5591 | 58 "paddw %%xmm2, %%xmm2 \n\t" |
59 "paddw %%xmm6, %%xmm6 \n\t" | |
60 "paddw %%xmm0, %%xmm2 \n\t" | |
61 "paddw %%xmm4, %%xmm6 \n\t" | |
62 "paddw %%xmm7, %%xmm2 \n\t" | |
63 "paddw %%xmm7, %%xmm6 \n\t" | |
64 "psraw $3, %%xmm2 \n\t" | |
65 "psraw $3, %%xmm6 \n\t" | |
3210 | 66 "movdqa (%0), %%xmm0 \n\t" |
67 "movdqa 16(%0), %%xmm4 \n\t" | |
5591 | 68 "psubw %%xmm2, %%xmm0 \n\t" |
69 "psubw %%xmm6, %%xmm4 \n\t" | |
3210 | 70 "movdqa %%xmm0, (%0) \n\t" |
71 "movdqa %%xmm4, 16(%0) \n\t" | |
72 :: "r"(&b[i]), "r"(&ref[i]) | |
73 : "memory" | |
74 ); | |
75 } | |
76 snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); | |
77 b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); | |
78 } | |
79 | |
80 { // Lift 1 | |
5591 | 81 IDWTELEM * const dst = b+w2; |
3210 | 82 |
83 i = 0; | |
5591 | 84 for(; (((long)&dst[i]) & 0x1F) && i<w_r; i++){ |
3210 | 85 dst[i] = dst[i] - (b[i] + b[i + 1]); |
86 } | |
5591 | 87 for(; i<w_r-15; i+=16){ |
3210 | 88 asm volatile( |
89 "movdqu (%1), %%xmm1 \n\t" | |
90 "movdqu 16(%1), %%xmm5 \n\t" | |
5591 | 91 "movdqu 2(%1), %%xmm2 \n\t" |
92 "movdqu 18(%1), %%xmm6 \n\t" | |
93 "paddw %%xmm1, %%xmm2 \n\t" | |
94 "paddw %%xmm5, %%xmm6 \n\t" | |
3210 | 95 "movdqa (%0), %%xmm0 \n\t" |
96 "movdqa 16(%0), %%xmm4 \n\t" | |
5591 | 97 "psubw %%xmm2, %%xmm0 \n\t" |
98 "psubw %%xmm6, %%xmm4 \n\t" | |
3210 | 99 "movdqa %%xmm0, (%0) \n\t" |
100 "movdqa %%xmm4, 16(%0) \n\t" | |
101 :: "r"(&dst[i]), "r"(&b[i]) | |
102 : "memory" | |
103 ); | |
104 } | |
105 snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); | |
106 } | |
107 | |
108 { // Lift 2 | |
5591 | 109 IDWTELEM * const ref = b+w2 - 1; |
110 IDWTELEM b_0 = b[0]; | |
3210 | 111 |
112 i = 0; | |
113 asm volatile( | |
5591 | 114 "psllw $1, %%xmm7 \n\t" |
3210 | 115 ::); |
5591 | 116 for(; i<w_l-15; i+=16){ |
3210 | 117 asm volatile( |
118 "movdqu (%1), %%xmm1 \n\t" | |
119 "movdqu 16(%1), %%xmm5 \n\t" | |
5591 | 120 "movdqu 2(%1), %%xmm0 \n\t" |
121 "movdqu 18(%1), %%xmm4 \n\t" //FIXME try aligned reads and shifts | |
122 "paddw %%xmm1, %%xmm0 \n\t" | |
123 "paddw %%xmm5, %%xmm4 \n\t" | |
124 "paddw %%xmm7, %%xmm0 \n\t" | |
125 "paddw %%xmm7, %%xmm4 \n\t" | |
5555
0790af6d0028
remove idiotc double subtraction from the sse2 code (untested, no sse2 here)
michael
parents:
5554
diff
changeset
|
126 "movdqa (%0), %%xmm1 \n\t" |
0790af6d0028
remove idiotc double subtraction from the sse2 code (untested, no sse2 here)
michael
parents:
5554
diff
changeset
|
127 "movdqa 16(%0), %%xmm5 \n\t" |
5591 | 128 "psraw $2, %%xmm0 \n\t" |
129 "psraw $2, %%xmm4 \n\t" | |
130 "paddw %%xmm1, %%xmm0 \n\t" | |
131 "paddw %%xmm5, %%xmm4 \n\t" | |
132 "psraw $2, %%xmm0 \n\t" | |
133 "psraw $2, %%xmm4 \n\t" | |
134 "paddw %%xmm1, %%xmm0 \n\t" | |
135 "paddw %%xmm5, %%xmm4 \n\t" | |
3210 | 136 "movdqa %%xmm0, (%0) \n\t" |
137 "movdqa %%xmm4, 16(%0) \n\t" | |
138 :: "r"(&b[i]), "r"(&ref[i]) | |
139 : "memory" | |
140 ); | |
141 } | |
142 snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); | |
5555
0790af6d0028
remove idiotc double subtraction from the sse2 code (untested, no sse2 here)
michael
parents:
5554
diff
changeset
|
143 b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); |
3210 | 144 } |
145 | |
146 { // Lift 3 | |
5591 | 147 IDWTELEM * const src = b+w2; |
3210 | 148 |
149 i = 0; | |
5591 | 150 for(; (((long)&temp[i]) & 0x1F) && i<w_r; i++){ |
3210 | 151 temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); |
152 } | |
153 for(; i<w_r-7; i+=8){ | |
154 asm volatile( | |
5591 | 155 "movdqu 2(%1), %%xmm2 \n\t" |
156 "movdqu 18(%1), %%xmm6 \n\t" | |
157 "paddw (%1), %%xmm2 \n\t" | |
158 "paddw 16(%1), %%xmm6 \n\t" | |
5565
93082c591c8b
Change rounding of the horizontal DWT to match the vertical one.
michael
parents:
5563
diff
changeset
|
159 "movdqu (%0), %%xmm0 \n\t" |
93082c591c8b
Change rounding of the horizontal DWT to match the vertical one.
michael
parents:
5563
diff
changeset
|
160 "movdqu 16(%0), %%xmm4 \n\t" |
5591 | 161 "paddw %%xmm2, %%xmm0 \n\t" |
162 "paddw %%xmm6, %%xmm4 \n\t" | |
163 "psraw $1, %%xmm2 \n\t" | |
164 "psraw $1, %%xmm6 \n\t" | |
165 "paddw %%xmm0, %%xmm2 \n\t" | |
166 "paddw %%xmm4, %%xmm6 \n\t" | |
3210 | 167 "movdqa %%xmm2, (%2) \n\t" |
168 "movdqa %%xmm6, 16(%2) \n\t" | |
169 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) | |
170 : "memory" | |
171 ); | |
172 } | |
5565
93082c591c8b
Change rounding of the horizontal DWT to match the vertical one.
michael
parents:
5563
diff
changeset
|
173 snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); |
3210 | 174 } |
175 | |
176 { | |
177 snow_interleave_line_header(&i, width, b, temp); | |
178 | |
5591 | 179 for (; (i & 0x3E) != 0x3E; i-=2){ |
180 b[i+1] = temp[i>>1]; | |
181 b[i] = b[i>>1]; | |
182 } | |
183 for (i-=62; i>=0; i-=64){ | |
184 asm volatile( | |
185 "movdqa (%1), %%xmm0 \n\t" | |
186 "movdqa 16(%1), %%xmm2 \n\t" | |
187 "movdqa 32(%1), %%xmm4 \n\t" | |
188 "movdqa 48(%1), %%xmm6 \n\t" | |
189 "movdqa (%1), %%xmm1 \n\t" | |
190 "movdqa 16(%1), %%xmm3 \n\t" | |
191 "movdqa 32(%1), %%xmm5 \n\t" | |
192 "movdqa 48(%1), %%xmm7 \n\t" | |
193 "punpcklwd (%2), %%xmm0 \n\t" | |
194 "punpcklwd 16(%2), %%xmm2 \n\t" | |
195 "punpcklwd 32(%2), %%xmm4 \n\t" | |
196 "punpcklwd 48(%2), %%xmm6 \n\t" | |
197 "movdqa %%xmm0, (%0) \n\t" | |
198 "movdqa %%xmm2, 32(%0) \n\t" | |
199 "movdqa %%xmm4, 64(%0) \n\t" | |
200 "movdqa %%xmm6, 96(%0) \n\t" | |
201 "punpckhwd (%2), %%xmm1 \n\t" | |
202 "punpckhwd 16(%2), %%xmm3 \n\t" | |
203 "punpckhwd 32(%2), %%xmm5 \n\t" | |
204 "punpckhwd 48(%2), %%xmm7 \n\t" | |
205 "movdqa %%xmm1, 16(%0) \n\t" | |
206 "movdqa %%xmm3, 48(%0) \n\t" | |
207 "movdqa %%xmm5, 80(%0) \n\t" | |
208 "movdqa %%xmm7, 112(%0) \n\t" | |
209 :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) | |
210 : "memory" | |
211 ); | |
212 } | |
213 } | |
214 } | |
215 | |
216 void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){ | |
217 const int w2= (width+1)>>1; | |
218 IDWTELEM temp[width >> 1]; | |
219 const int w_l= (width>>1); | |
220 const int w_r= w2 - 1; | |
221 int i; | |
222 | |
223 { // Lift 0 | |
224 IDWTELEM * const ref = b + w2 - 1; | |
225 | |
226 i = 1; | |
227 b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); | |
228 asm volatile( | |
229 "pcmpeqw %%mm7, %%mm7 \n\t" | |
230 "psllw $15, %%mm7 \n\t" | |
231 "psrlw $13, %%mm7 \n\t" | |
232 ::); | |
233 for(; i<w_l-7; i+=8){ | |
234 asm volatile( | |
235 "movq (%1), %%mm2 \n\t" | |
236 "movq 8(%1), %%mm6 \n\t" | |
237 "paddw 2(%1), %%mm2 \n\t" | |
238 "paddw 10(%1), %%mm6 \n\t" | |
239 "movq %%mm2, %%mm0 \n\t" | |
240 "movq %%mm6, %%mm4 \n\t" | |
241 "paddw %%mm2, %%mm2 \n\t" | |
242 "paddw %%mm6, %%mm6 \n\t" | |
243 "paddw %%mm0, %%mm2 \n\t" | |
244 "paddw %%mm4, %%mm6 \n\t" | |
245 "paddw %%mm7, %%mm2 \n\t" | |
246 "paddw %%mm7, %%mm6 \n\t" | |
247 "psraw $3, %%mm2 \n\t" | |
248 "psraw $3, %%mm6 \n\t" | |
249 "movq (%0), %%mm0 \n\t" | |
250 "movq 8(%0), %%mm4 \n\t" | |
251 "psubw %%mm2, %%mm0 \n\t" | |
252 "psubw %%mm6, %%mm4 \n\t" | |
253 "movq %%mm0, (%0) \n\t" | |
254 "movq %%mm4, 8(%0) \n\t" | |
255 :: "r"(&b[i]), "r"(&ref[i]) | |
256 : "memory" | |
257 ); | |
258 } | |
259 snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); | |
260 } | |
261 | |
262 { // Lift 1 | |
263 IDWTELEM * const dst = b+w2; | |
264 | |
265 i = 0; | |
266 for(; i<w_r-7; i+=8){ | |
267 asm volatile( | |
268 "movq (%1), %%mm2 \n\t" | |
269 "movq 8(%1), %%mm6 \n\t" | |
270 "paddw 2(%1), %%mm2 \n\t" | |
271 "paddw 10(%1), %%mm6 \n\t" | |
272 "movq (%0), %%mm0 \n\t" | |
273 "movq 8(%0), %%mm4 \n\t" | |
274 "psubw %%mm2, %%mm0 \n\t" | |
275 "psubw %%mm6, %%mm4 \n\t" | |
276 "movq %%mm0, (%0) \n\t" | |
277 "movq %%mm4, 8(%0) \n\t" | |
278 :: "r"(&dst[i]), "r"(&b[i]) | |
279 : "memory" | |
280 ); | |
281 } | |
282 snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); | |
283 } | |
284 | |
285 { // Lift 2 | |
286 IDWTELEM * const ref = b+w2 - 1; | |
287 | |
288 i = 1; | |
289 b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); | |
290 asm volatile( | |
291 "psllw $1, %%mm7 \n\t" | |
292 ::); | |
293 for(; i<w_l-7; i+=8){ | |
294 asm volatile( | |
295 "movq (%1), %%mm0 \n\t" | |
296 "movq 8(%1), %%mm4 \n\t" | |
297 "paddw 2(%1), %%mm0 \n\t" | |
298 "paddw 10(%1), %%mm4 \n\t" | |
299 "paddw %%mm7, %%mm0 \n\t" | |
300 "paddw %%mm7, %%mm4 \n\t" | |
301 "psraw $2, %%mm0 \n\t" | |
302 "psraw $2, %%mm4 \n\t" | |
303 "movq (%0), %%mm1 \n\t" | |
304 "movq 8(%0), %%mm5 \n\t" | |
305 "paddw %%mm1, %%mm0 \n\t" | |
306 "paddw %%mm5, %%mm4 \n\t" | |
307 "psraw $2, %%mm0 \n\t" | |
308 "psraw $2, %%mm4 \n\t" | |
309 "paddw %%mm1, %%mm0 \n\t" | |
310 "paddw %%mm5, %%mm4 \n\t" | |
311 "movq %%mm0, (%0) \n\t" | |
312 "movq %%mm4, 8(%0) \n\t" | |
313 :: "r"(&b[i]), "r"(&ref[i]) | |
314 : "memory" | |
315 ); | |
316 } | |
317 snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); | |
318 } | |
319 | |
320 { // Lift 3 | |
321 IDWTELEM * const src = b+w2; | |
322 i = 0; | |
323 | |
324 for(; i<w_r-7; i+=8){ | |
325 asm volatile( | |
326 "movq 2(%1), %%mm2 \n\t" | |
327 "movq 10(%1), %%mm6 \n\t" | |
328 "paddw (%1), %%mm2 \n\t" | |
329 "paddw 8(%1), %%mm6 \n\t" | |
330 "movq (%0), %%mm0 \n\t" | |
331 "movq 8(%0), %%mm4 \n\t" | |
332 "paddw %%mm2, %%mm0 \n\t" | |
333 "paddw %%mm6, %%mm4 \n\t" | |
334 "psraw $1, %%mm2 \n\t" | |
335 "psraw $1, %%mm6 \n\t" | |
336 "paddw %%mm0, %%mm2 \n\t" | |
337 "paddw %%mm4, %%mm6 \n\t" | |
338 "movq %%mm2, (%2) \n\t" | |
339 "movq %%mm6, 8(%2) \n\t" | |
340 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) | |
341 : "memory" | |
342 ); | |
343 } | |
344 snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); | |
345 } | |
346 | |
347 { | |
348 snow_interleave_line_header(&i, width, b, temp); | |
349 | |
3210 | 350 for (; (i & 0x1E) != 0x1E; i-=2){ |
351 b[i+1] = temp[i>>1]; | |
352 b[i] = b[i>>1]; | |
353 } | |
354 for (i-=30; i>=0; i-=32){ | |
355 asm volatile( | |
356 "movq (%1), %%mm0 \n\t" | |
357 "movq 8(%1), %%mm2 \n\t" | |
358 "movq 16(%1), %%mm4 \n\t" | |
359 "movq 24(%1), %%mm6 \n\t" | |
360 "movq (%1), %%mm1 \n\t" | |
361 "movq 8(%1), %%mm3 \n\t" | |
362 "movq 16(%1), %%mm5 \n\t" | |
363 "movq 24(%1), %%mm7 \n\t" | |
5591 | 364 "punpcklwd (%2), %%mm0 \n\t" |
365 "punpcklwd 8(%2), %%mm2 \n\t" | |
366 "punpcklwd 16(%2), %%mm4 \n\t" | |
367 "punpcklwd 24(%2), %%mm6 \n\t" | |
3210 | 368 "movq %%mm0, (%0) \n\t" |
369 "movq %%mm2, 16(%0) \n\t" | |
370 "movq %%mm4, 32(%0) \n\t" | |
371 "movq %%mm6, 48(%0) \n\t" | |
5591 | 372 "punpckhwd (%2), %%mm1 \n\t" |
373 "punpckhwd 8(%2), %%mm3 \n\t" | |
374 "punpckhwd 16(%2), %%mm5 \n\t" | |
375 "punpckhwd 24(%2), %%mm7 \n\t" | |
3210 | 376 "movq %%mm1, 8(%0) \n\t" |
377 "movq %%mm3, 24(%0) \n\t" | |
378 "movq %%mm5, 40(%0) \n\t" | |
379 "movq %%mm7, 56(%0) \n\t" | |
380 :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) | |
381 : "memory" | |
382 ); | |
383 } | |
384 } | |
385 } | |
386 | |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
/*
 * Building blocks for the SSE2 vertical compose asm. Every macro expands to
 * four lines of asm text that apply one operation to four xmm registers,
 * i.e. to 64 bytes of a line. Arguments are string literals (register names,
 * mnemonics, shift counts); memory operands are addressed as
 * offset(r, REG_d, 2), with REG_d holding the element index.
 */

/* "op" from memory at r into registers t0..t3 */
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",2), %%"t0" \n\t"\
        ""op" 16(%%"r",%%"REG_d",2), %%"t1" \n\t"\
        ""op" 32(%%"r",%%"REG_d",2), %%"t2" \n\t"\
        ""op" 48(%%"r",%%"REG_d",2), %%"t3" \n\t"

/* aligned load: t = mem[r] */
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

/* 16-bit add from memory: t += mem[r] */
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

/* aligned store: mem[w] = s */
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", (%%"w",%%"REG_d",2) \n\t"\
        "movdqa %%"s1", 16(%%"w",%%"REG_d",2) \n\t"\
        "movdqa %%"s2", 32(%%"w",%%"REG_d",2) \n\t"\
        "movdqa %%"s3", 48(%%"w",%%"REG_d",2) \n\t"

/* arithmetic right shift of the 16-bit lanes: t >>= n */
#define snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

/* register-to-register "op" with sources s0..s3 and destinations t0..t3 */
#define snow_vertical_compose_sse2_r2r_op(op,s0,s1,s2,s3,t0,t1,t2,t3)\
        ""op" %%"s0", %%"t0" \n\t"\
        ""op" %%"s1", %%"t1" \n\t"\
        ""op" %%"s2", %%"t2" \n\t"\
        ""op" %%"s3", %%"t3" \n\t"

/* t -= s */
#define snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_r2r_op("psubw",s0,s1,s2,s3,t0,t1,t2,t3)

/* t += s */
#define snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_r2r_op("paddw",s0,s1,s2,s3,t0,t1,t2,t3)

/* t = s */
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_r2r_op("movdqa",s0,s1,s2,s3,t0,t1,t2,t3)
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
428 |
5591 | 429 void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
430 long i = width; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
431 |
5591 | 432 while(i & 0x1F) |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
433 { |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
434 i--; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
435 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
436 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
437 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
438 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
439 } |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
440 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
441 asm volatile ( |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
442 "jmp 2f \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
443 "1: \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
444 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
445 "mov %6, %%"REG_a" \n\t" |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
446 "mov %4, %%"REG_S" \n\t" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
447 |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
448 snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
449 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
450 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
5558 | 451 snow_vertical_compose_sse2_r2r_add("xmm0","xmm2","xmm4","xmm6","xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
452 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
453 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
454 "pcmpeqd %%xmm1, %%xmm1 \n\t" |
5591 | 455 "psllw $15, %%xmm1 \n\t" |
456 "psrlw $13, %%xmm1 \n\t" | |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
457 "mov %5, %%"REG_a" \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
458 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
459 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
460 snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
461 snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
462 snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
463 snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
464 "mov %3, %%"REG_c" \n\t" |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
465 snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
466 snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
467 snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
468 snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
469 "mov %2, %%"REG_a" \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
470 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
5560
2e63bd8e894d
Simplify and speedup code, reduce needed headroom by 2 bits in the 3rd
michael
parents:
5559
diff
changeset
|
471 snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6") |
2e63bd8e894d
Simplify and speedup code, reduce needed headroom by 2 bits in the 3rd
michael
parents:
5559
diff
changeset
|
472 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
473 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
474 "pcmpeqd %%xmm1, %%xmm1 \n\t" |
5591 | 475 "psllw $15, %%xmm1 \n\t" |
476 "psrlw $14, %%xmm1 \n\t" | |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
477 "mov %1, %%"REG_S" \n\t" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
478 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
479 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") |
5560
2e63bd8e894d
Simplify and speedup code, reduce needed headroom by 2 bits in the 3rd
michael
parents:
5559
diff
changeset
|
480 snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
481 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
482 snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6") |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
483 snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
484 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
5562
0074dd43ae5e
Simplify and optimize the 4th vertical lifting step of the SSE2 code (untested)
michael
parents:
5561
diff
changeset
|
485 snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
486 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
487 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
488 snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
489 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
490 "2: \n\t" |
5591 | 491 "sub $32, %%"REG_d" \n\t" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
492 "jge 1b \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
493 :"+d"(i) |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
494 : |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
495 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
496 "%"REG_a"","%"REG_S"","%"REG_c""); |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
497 } |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
498 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
/*
 * Building blocks for the MMX vertical compose asm. Every macro expands to
 * four lines of asm text that apply one operation to four mmx registers,
 * i.e. to 32 bytes of a line. Arguments are string literals; memory operands
 * are addressed as offset(r, REG_d, 2), with REG_d holding the element index.
 */

/* one asm line: "op" from memory at off(r,REG_d,2) into register t */
#define snow_vertical_compose_mmx_mem_op(op,off,r,t)\
        ""op" "off"(%%"r",%%"REG_d",2), %%"t" \n\t"

/* "op" from memory at r into registers t0..t3 */
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_mem_op(op,"",r,t0)\
        snow_vertical_compose_mmx_mem_op(op,"8",r,t1)\
        snow_vertical_compose_mmx_mem_op(op,"16",r,t2)\
        snow_vertical_compose_mmx_mem_op(op,"24",r,t3)

/* load: t = mem[r] */
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

/* 16-bit add from memory: t += mem[r] */
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

/* t -= s; the asm text is the same as for SSE2, only the register names differ */
#define snow_vertical_compose_mmx_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)

/* store: mem[w] = s */
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", (%%"w",%%"REG_d",2) \n\t"\
        "movq %%"s1", 8(%%"w",%%"REG_d",2) \n\t"\
        "movq %%"s2", 16(%%"w",%%"REG_d",2) \n\t"\
        "movq %%"s3", 24(%%"w",%%"REG_d",2) \n\t"
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
519 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
520 #define snow_vertical_compose_mmx_sra(n,t0,t1,t2,t3)\ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
521 snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3) |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
522 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
523 #define snow_vertical_compose_mmx_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
524 snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3) |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
525 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
526 #define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
527 "movq %%"s0", %%"t0" \n\t"\ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
528 "movq %%"s1", %%"t1" \n\t"\ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
529 "movq %%"s2", %%"t2" \n\t"\ |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
530 "movq %%"s3", %%"t3" \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
531 |
5591 | 532 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
533 long i = width; |
5591 | 534 while(i & 15) |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
535 { |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
536 i--; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
537 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
538 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
539 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
540 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
541 } |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
542 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
543 asm volatile( |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
544 "jmp 2f \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
545 "1: \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
546 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
547 "mov %6, %%"REG_a" \n\t" |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
548 "mov %4, %%"REG_S" \n\t" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
549 |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
550 snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
551 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
552 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
5557 | 553 snow_vertical_compose_mmx_r2r_add("mm0","mm2","mm4","mm6","mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
554 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
555 |
5591 | 556 "pcmpeqw %%mm1, %%mm1 \n\t" |
557 "psllw $15, %%mm1 \n\t" | |
558 "psrlw $13, %%mm1 \n\t" | |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
559 "mov %5, %%"REG_a" \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
560 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
561 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
562 snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
563 snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
564 snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
565 snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
566 "mov %3, %%"REG_c" \n\t" |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
567 snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
568 snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
569 snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
570 snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
571 "mov %2, %%"REG_a" \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
572 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
5559
9993d38f59cf
simplify, speedup and reduce needed headroom by 2 bits in the 3rd
michael
parents:
5558
diff
changeset
|
573 snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6") |
9993d38f59cf
simplify, speedup and reduce needed headroom by 2 bits in the 3rd
michael
parents:
5558
diff
changeset
|
574 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
575 |
5591 | 576 "pcmpeqw %%mm1, %%mm1 \n\t" |
577 "psllw $15, %%mm1 \n\t" | |
578 "psrlw $14, %%mm1 \n\t" | |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
579 "mov %1, %%"REG_S" \n\t" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
580 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
581 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") |
5559
9993d38f59cf
simplify, speedup and reduce needed headroom by 2 bits in the 3rd
michael
parents:
5558
diff
changeset
|
582 snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
583 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
584 snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6") |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
585 snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
586 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
5561
225867be27f7
Simplify and optimize the 4th vertical lifting step of the MMX code
michael
parents:
5560
diff
changeset
|
587 snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6") |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
588 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
589 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
590 snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6") |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
591 |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
592 "2: \n\t" |
5591 | 593 "sub $16, %%"REG_d" \n\t" |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
594 "jge 1b \n\t" |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
595 :"+d"(i) |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
596 : |
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
597 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
598 "%"REG_a"","%"REG_S"","%"REG_c""); |
3207
33110c1008a4
Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff
changeset
|
599 } |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
600 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
/*
 * Building blocks for the SSE2 inner_add_yblock functions.
 *
 * The macros are meant to be pasted together inside a function body:
 * "header" declares the locals and OPENS an asm volatile statement, the
 * start/accum macros contribute asm template text, and one of the "end"
 * macros appends the loop tail plus the operand lists and CLOSES the statement.
 *
 * Operand map of the assembled asm statement (see end_common2):
 *   %0 dst8   %1 dst_array   %2 tmp (row counter, loaded from b_h)
 *   %3 (long)(src_x<<1)   %4 obmc   %5 block (in REG_a)   %6 b_h   %7 src_stride
 * Register use: REG_c = src_stride, REG_S = obmc pointer, REG_D = current
 * slice-buffer row pointer, REG_d = scratch pointer, xmm7 = 0, xmm3 = rounding.
 */

/*
 * Declare dst_array/tmp, open the asm statement, load the loop invariants and
 * emit loop label "1:" followed by REG_D = dst_array[0] + %3.
 * xmm3 ends up with 8 in every 16-bit word (all-ones << 15 >> 12).
 */
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    long tmp;\
    asm volatile(\
             "mov %7, %%"REG_c" \n\t"\
             "mov %6, %2 \n\t"\
             "mov %4, %%"REG_S" \n\t"\
             "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
             "pcmpeqd %%xmm3, %%xmm3 \n\t"\
             "psllw $15, %%xmm3 \n\t"\
             "psrlw $12, %%xmm3 \n\t" /* 8 per word; original note: FRAC_BITS >> 1 */\
             "1: \n\t"\
             "mov %1, %%"REG_D" \n\t"\
             "mov (%%"REG_D"), %%"REG_D" \n\t"\
             "add %3, %%"REG_D" \n\t"

/*
 * 8-pixel-wide variant: load 8 bytes from block[ptr_offset] for the current
 * row and the row one stride (REG_c) below, widen them to words, and multiply
 * by the 8 obmc bytes at s_offset and s_offset+16 respectively.
 * Results go to out_reg1 (first row) and out_reg2 (second row).
 */
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0 \n\t"\
             "punpcklbw %%xmm7, %%xmm4 \n\t"\
             "pmullw %%xmm0, %%"out_reg1" \n\t"\
             "pmullw %%xmm4, %%"out_reg2" \n\t"

/*
 * 16-pixel-wide variant: load bytes 0..7 and 8..15 of the current row of
 * block[ptr_offset], widen them to words, and multiply by the obmc bytes at
 * s_offset and s_offset+8.  Results go to out_reg1 and out_reg2.
 */
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
             "punpcklbw %%xmm7, %%xmm0 \n\t"\
             "punpcklbw %%xmm7, %%xmm4 \n\t"\
             "pmullw %%xmm0, %%"out_reg1" \n\t"\
             "pmullw %%xmm4, %%"out_reg2" \n\t"

/* Same products as start_8, accumulated into xmm1/xmm5 with unsigned saturation. */
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1 \n\t"\
             "paddusw %%xmm6, %%xmm5 \n\t"

/* Same products as start_16, accumulated into xmm1/xmm5 with unsigned saturation. */
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1 \n\t"\
             "paddusw %%xmm6, %%xmm5 \n\t"

/*
 * Advance for the next iteration: obmc pointer by 32 bytes, dst8 and the four
 * block[] pointers (updated in memory) by REG_c bytes.
 */
#define snow_inner_add_yblock_sse2_end_common1\
             "add $32, %%"REG_S" \n\t"\
             "add %%"REG_c", %0 \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a") \n\t"

/*
 * Loop branch, operand lists and end of the asm statement.
 * The "memory" clobber is required because the asm stores through dst8 and
 * rewrites block[0..3] without those objects being listed as operands.
 */
#define snow_inner_add_yblock_sse2_end_common2\
             "jnz 1b \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"","memory");

/*
 * Loop tail for the two-rows-per-iteration (8-wide) functions: the stride in
 * REG_c is doubled while the pointers are advanced, dst_array moves on by two
 * row pointers and the row counter drops by 2.
 */
#define snow_inner_add_yblock_sse2_end_8\
             "sal $1, %%"REG_c" \n\t"\
             "add $"PTR_SIZE"*2, %1 \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "sar $1, %%"REG_c" \n\t"\
             "sub $2, %2 \n\t"\
             snow_inner_add_yblock_sse2_end_common2

/*
 * Loop tail for the one-row-per-iteration (16-wide) functions: dst_array moves
 * on by one row pointer and the row counter drops by 1.
 */
#define snow_inner_add_yblock_sse2_end_16\
             "add $"PTR_SIZE"*1, %1 \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "dec %2 \n\t"\
             snow_inner_add_yblock_sse2_end_common2
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
681 |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
3947
diff
changeset
|
682 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
683 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
684 snow_inner_add_yblock_sse2_header |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
685 snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
686 snow_inner_add_yblock_sse2_accum_8("2", "8") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
687 snow_inner_add_yblock_sse2_accum_8("1", "128") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
688 snow_inner_add_yblock_sse2_accum_8("0", "136") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
689 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
690 "mov %0, %%"REG_d" \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
691 "movdqa (%%"REG_D"), %%xmm0 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
692 "movdqa %%xmm1, %%xmm2 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
693 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
694 "punpckhwd %%xmm7, %%xmm1 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
695 "punpcklwd %%xmm7, %%xmm2 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
696 "paddd %%xmm2, %%xmm0 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
697 "movdqa 16(%%"REG_D"), %%xmm2 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
698 "paddd %%xmm1, %%xmm2 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
699 "paddd %%xmm3, %%xmm0 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
700 "paddd %%xmm3, %%xmm2 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
701 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
702 "mov %1, %%"REG_D" \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
703 "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" |
3566
b63ef6fcbc70
Fix x86 SIMD asm and pic, patch from Martin von Gagern <Martin.vGagern@gmx.net>
lu_zero
parents:
3398
diff
changeset
|
704 "add %3, %%"REG_D" \n\t" |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
705 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
706 "movdqa (%%"REG_D"), %%xmm4 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
707 "movdqa %%xmm5, %%xmm6 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
708 "punpckhwd %%xmm7, %%xmm5 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
709 "punpcklwd %%xmm7, %%xmm6 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
710 "paddd %%xmm6, %%xmm4 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
711 "movdqa 16(%%"REG_D"), %%xmm6 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
712 "paddd %%xmm5, %%xmm6 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
713 "paddd %%xmm3, %%xmm4 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
714 "paddd %%xmm3, %%xmm6 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
715 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
716 "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
717 "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
718 "packssdw %%xmm2, %%xmm0 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
719 "packuswb %%xmm7, %%xmm0 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
720 "movq %%xmm0, (%%"REG_d") \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
721 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
722 "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
723 "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
724 "packssdw %%xmm6, %%xmm4 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
725 "packuswb %%xmm7, %%xmm4 \n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
726 "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
727 snow_inner_add_yblock_sse2_end_8 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
728 } |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
729 |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
3947
diff
changeset
|
730 static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
731 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
732 snow_inner_add_yblock_sse2_header |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
733 snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
734 snow_inner_add_yblock_sse2_accum_16("2", "16") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
735 snow_inner_add_yblock_sse2_accum_16("1", "512") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
736 snow_inner_add_yblock_sse2_accum_16("0", "528") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
737 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
738 "mov %0, %%"REG_d" \n\t" |
5591 | 739 "psrlw $4, %%xmm1 \n\t" |
740 "psrlw $4, %%xmm5 \n\t" | |
741 "paddw (%%"REG_D"), %%xmm1 \n\t" | |
742 "paddw 16(%%"REG_D"), %%xmm5 \n\t" | |
743 "paddw %%xmm3, %%xmm1 \n\t" | |
744 "paddw %%xmm3, %%xmm5 \n\t" | |
745 "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ | |
746 "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ | |
747 "packuswb %%xmm5, %%xmm1 \n\t" | |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
748 |
5591 | 749 "movdqu %%xmm1, (%%"REG_d") \n\t" |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
750 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
751 snow_inner_add_yblock_sse2_end_16 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
752 } |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
753 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
/*
 * Prologue shared by the MMX inner_add_yblock kernels.
 *
 * Declares the locals referenced by the asm operands and opens an
 * "asm volatile(" statement that is deliberately left unterminated: the
 * start/accum/mix macros append instructions to it and
 * snow_inner_add_yblock_mmx_end closes it with the operand and clobber
 * lists.  Operand numbers refer to that list:
 *   %1 = dst_array, %2 = tmp, %3 = src_x<<1, %4 = obmc,
 *   %6 = b_h, %7 = src_stride.
 *
 * State established here:
 *   REG_c = src_stride, tmp = b_h (row counter), REG_S = obmc,
 *   mm7   = 0 (used to widen bytes to words),
 *   mm3   = 8 in every 16-bit word (0xFFFF << 15 >> 12), the rounding
 *           term added before the ">> 4" in snow_inner_add_yblock_mmx_mix,
 * and, at the top of every row (label 1):
 *   REG_D = *dst_array + (src_x<<1).
 *
 * Must be expanded inside a function that has sb, src_y and the operands
 * named in the end macro in scope.
 */
#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    long tmp;\
    asm volatile(\
             "mov %7, %%"REG_c" \n\t"\
             "mov %6, %2 \n\t"\
             "mov %4, %%"REG_S" \n\t"\
             "pxor %%mm7, %%mm7 \n\t" /* 0 */\
             "pcmpeqd %%mm3, %%mm3 \n\t"\
             "psllw $15, %%mm3 \n\t"\
             "psrlw $12, %%mm3 \n\t" /* 8 per word: rounding for >> 4 */\
             "1: \n\t"\
             "mov %1, %%"REG_D" \n\t"\
             "mov (%%"REG_D"), %%"REG_D" \n\t"\
             "add %3, %%"REG_D" \n\t"
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
769 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
/*
 * Multiply 8 pixels by 8 weights and leave the products in two registers.
 *
 *   out_reg1, out_reg2  names of the MMX registers receiving pixels 0-3 and
 *                       4-7 (as 16-bit products)
 *   ptr_offset          index into the pointer array addressed by REG_a
 *   s_offset            byte offset of the weights relative to REG_S
 *   d_offset            byte offset of the pixels relative to the pointer
 *                       fetched from the array
 *
 * All arguments are string literals pasted into the asm text.
 * Uses REG_d, mm0 and mm4 as scratch and expects mm7 to be zero.
 * pmullw keeps only the low 16 bits of each product.
 */
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
             /* REG_d = pointer number ptr_offset from the array at REG_a */\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             /* 2 x 4 pixel bytes, zero-extended to words */\
             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
             /* 2 x 4 weight bytes, zero-extended to words */\
             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
             "punpcklbw %%mm7, %%mm0 \n\t"\
             "punpcklbw %%mm7, %%mm4 \n\t"\
             /* pixel * weight, word by word */\
             "pmullw %%mm0, %%"out_reg1" \n\t"\
             "pmullw %%mm4, %%"out_reg2" \n\t"
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
782 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
/*
 * Compute one more pixel*weight term (see snow_inner_add_yblock_mmx_start,
 * here targeting mm2/mm6) and add it, with unsigned saturation, to the
 * running sums kept in mm1 (pixels 0-3) and mm5 (pixels 4-7).
 */
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
             "paddusw %%mm2, %%mm1 \n\t"\
             "paddusw %%mm6, %%mm5 \n\t"
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
787 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
/*
 * Combine the weighted sums in mm1/mm5 with 8 words read from REG_D and
 * store the result as 8 bytes.
 *
 *   read_offset   byte offset (string literal) added to REG_D for the load
 *   write_offset  byte offset (string literal) added to the pointer taken
 *                 from operand %0 for the store
 *
 * Per 16-bit lane:  out = sat_u8( ((sum >> 4) + mem + mm3) >> 4 )
 * where the first shift is logical, the second arithmetic, and mm3 is the
 * constant prepared by snow_inner_add_yblock_mmx_header.
 * Clobbers REG_d.
 */
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
             "mov %0, %%"REG_d" \n\t"\
             "psrlw $4, %%mm1 \n\t"\
             "psrlw $4, %%mm5 \n\t"\
             "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
             "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
             "paddw %%mm3, %%mm1 \n\t"\
             "paddw %%mm3, %%mm5 \n\t"\
             "psraw $4, %%mm1 \n\t"\
             "psraw $4, %%mm5 \n\t"\
             "packuswb %%mm5, %%mm1 \n\t"\
             "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
800 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
/*
 * Epilogue of the MMX inner_add_yblock kernels: advances every pointer to
 * the next row, loops back to label 1 while rows remain, and closes the asm
 * statement opened by snow_inner_add_yblock_mmx_header.
 *
 *   s_step  number of bytes (string literal) to advance REG_S per row
 *
 * Per row:
 *   - REG_S += s_step
 *   - the four pointers stored in the array at REG_a (operand "a"(block))
 *     are each advanced by REG_c in place, i.e. the caller's array is
 *     modified
 *   - dst_array (%1) moves on by one pointer, dst8 (%0) by REG_c
 *   - tmp (%2) is decremented; the loop ends when it reaches zero
 *
 * Operands: %0 dst8, %1 dst_array, %2 tmp, %3 src_x<<1, %4 obmc,
 *           %5 block (in REG_a), %6 b_h, %7 src_stride.
 *
 * The asm stores pixels through the pointer held in dst8 and rewrites the
 * entries of block[], none of which are expressed as output operands, so
 * the "memory" clobber is required to keep the compiler from caching or
 * reordering accesses to that memory across the statement.
 */
#define snow_inner_add_yblock_mmx_end(s_step)\
             "add $"s_step", %%"REG_S" \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a") \n\t"\
             "add $"PTR_SIZE"*1, %1 \n\t"\
             "add %%"REG_c", %0 \n\t"\
             "dec %2 \n\t"\
             "jnz 1b \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"","memory");
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
815 |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
3947
diff
changeset
|
816 static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
817 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
818 snow_inner_add_yblock_mmx_header |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
819 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
820 snow_inner_add_yblock_mmx_accum("2", "8", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
821 snow_inner_add_yblock_mmx_accum("1", "128", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
822 snow_inner_add_yblock_mmx_accum("0", "136", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
823 snow_inner_add_yblock_mmx_mix("0", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
824 snow_inner_add_yblock_mmx_end("16") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
825 } |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
826 |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
3947
diff
changeset
|
827 static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
828 int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
829 snow_inner_add_yblock_mmx_header |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
830 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
831 snow_inner_add_yblock_mmx_accum("2", "16", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
832 snow_inner_add_yblock_mmx_accum("1", "512", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
833 snow_inner_add_yblock_mmx_accum("0", "528", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
834 snow_inner_add_yblock_mmx_mix("0", "0") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
835 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
836 snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
837 snow_inner_add_yblock_mmx_accum("2", "24", "8") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
838 snow_inner_add_yblock_mmx_accum("1", "520", "8") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
839 snow_inner_add_yblock_mmx_accum("0", "536", "8") |
5591 | 840 snow_inner_add_yblock_mmx_mix("16", "8") |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
841 snow_inner_add_yblock_mmx_end("32") |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
842 } |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
843 |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
3947
diff
changeset
|
844 void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
845 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
846 |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
847 if (b_w == 16) |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
848 inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
849 else if (b_w == 8 && obmc_stride == 16) { |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
850 if (!(b_h & 1)) |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
851 inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
852 else |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
853 inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
854 } else |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
855 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
856 } |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
857 |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
3947
diff
changeset
|
858 void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
3211
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
859 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
860 if (b_w == 16) |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
861 inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
862 else if (b_w == 8 && obmc_stride == 16) |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
863 inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
864 else |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
865 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
b77b5e7072d6
add MMX and SSE versions of ff_snow_inner_add_yblock
gpoirier
parents:
3210
diff
changeset
|
866 } |