comparison: x86/vc1dsp_mmx.c @ 9441:e14cd3ac3806 (libavcodec)
VC1: extend MMX qpel MC to include MMX2 avg qpel
author | conrad |
---|---|
date | Wed, 15 Apr 2009 02:25:42 +0000 |
parents | 7768bdfd4f7b |
children | 7a116de63777 |
9440:daee921fb6bb | 9441:e14cd3ac3806 |
---|---|
26 | 26 |
27 #include "libavutil/x86_cpu.h" | 27 #include "libavutil/x86_cpu.h" |
28 #include "libavcodec/dsputil.h" | 28 #include "libavcodec/dsputil.h" |
29 #include "dsputil_mmx.h" | 29 #include "dsputil_mmx.h" |
30 | 30 |
31 #define OP_PUT(S,D) | |
32 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t" | |
33 | |
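The two OP macros added above are the heart of this changeset: OP_PUT expands to nothing, so the existing movq store simply overwrites the destination, while OP_AVG emits a pavgb that averages the freshly computed pixels with the eight bytes already at the destination before the store. A per-byte scalar sketch of the difference (helper names here are illustrative, not from the patch):

```c
#include <stdint.h>

/* OP_PUT: the computed pixel replaces dst outright. */
static inline uint8_t op_put(uint8_t dst, uint8_t src) { (void)dst; return src; }

/* OP_AVG: what pavgb computes for each packed byte, a rounding
 * average of source and destination. */
static inline uint8_t op_avg(uint8_t dst, uint8_t src)
{
    return (uint8_t)((dst + src + 1) >> 1);
}
```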
31 /** Add rounder from mm7 to mm3 and pack result at destination */ | 34 /** Add rounder from mm7 to mm3 and pack result at destination */ |
32 #define NORMALIZE_MMX(SHIFT) \ | 35 #define NORMALIZE_MMX(SHIFT) \ |
33 "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ | 36 "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ |
34 "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ | 37 "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ |
35 "psraw "SHIFT", %%mm3 \n\t" \ | 38 "psraw "SHIFT", %%mm3 \n\t" \ |
36 "psraw "SHIFT", %%mm4 \n\t" | 39 "psraw "SHIFT", %%mm4 \n\t" |
37 | 40 |
38 #define TRANSFER_DO_PACK \ | 41 #define TRANSFER_DO_PACK(OP) \ |
39 "packuswb %%mm4, %%mm3 \n\t" \ | 42 "packuswb %%mm4, %%mm3 \n\t" \ |
43 OP((%2), %%mm3) \ | |
40 "movq %%mm3, (%2) \n\t" | 44 "movq %%mm3, (%2) \n\t" |
41 | 45 |
42 #define TRANSFER_DONT_PACK \ | 46 #define TRANSFER_DONT_PACK(OP) \ |
47 OP(0(%2), %%mm3) \ | |
48 OP(8(%2), %%mm4) \ | |
43 "movq %%mm3, 0(%2) \n\t" \ | 49 "movq %%mm3, 0(%2) \n\t" \ |
44 "movq %%mm4, 8(%2) \n\t" | 50 "movq %%mm4, 8(%2) \n\t" |
45 | 51 |
46 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ | 52 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ |
47 #define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" | 53 #define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" |
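DO_UNPACK depends on %%mm0 holding zero (set once by the pxor in vc1_mspel_mc further down): interleaving a register's low four bytes with zero bytes widens them to unsigned 16-bit words, giving the 9* and 18* multiplies room to work without overflow. A scalar model of that zero-extension:

```c
#include <stdint.h>

/* Model of "punpcklbw %%mm0, reg" when mm0 == 0: the low four bytes
 * of reg come out as four zero-extended 16-bit words. */
static void do_unpack_scalar(const uint8_t in[4], int16_t out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = (int16_t)in[i];
}
```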
105 | 111 |
106 /** | 112 /** |
107 * Data is already unpacked, so some operations can be performed directly | 113 * Data is already unpacked, so some operations can be performed directly |
108 * on memory operands. | 114 * on memory operands. |
109 */ | 115 */ |
110 static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride, | 116 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\ |
111 const int16_t *src, int rnd) | 117 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\ |
112 { | 118 const int16_t *src, int rnd)\ |
113 int h = 8; | 119 {\ |
114 | 120 int h = 8;\ |
115 src -= 1; | 121 \ |
116 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ | 122 src -= 1;\ |
117 __asm__ volatile( | 123 rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\ |
118 LOAD_ROUNDER_MMX("%4") | 124 __asm__ volatile(\ |
119 "movq "MANGLE(ff_pw_128)", %%mm6\n\t" | 125 LOAD_ROUNDER_MMX("%4")\ |
120 "movq "MANGLE(ff_pw_9)", %%mm5 \n\t" | 126 "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\ |
121 "1: \n\t" | 127 "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\ |
122 "movq 2*0+0(%1), %%mm1 \n\t" | 128 "1: \n\t"\ |
123 "movq 2*0+8(%1), %%mm2 \n\t" | 129 "movq 2*0+0(%1), %%mm1 \n\t"\ |
124 "movq 2*1+0(%1), %%mm3 \n\t" | 130 "movq 2*0+8(%1), %%mm2 \n\t"\ |
125 "movq 2*1+8(%1), %%mm4 \n\t" | 131 "movq 2*1+0(%1), %%mm3 \n\t"\ |
126 "paddw 2*3+0(%1), %%mm1 \n\t" | 132 "movq 2*1+8(%1), %%mm4 \n\t"\ |
127 "paddw 2*3+8(%1), %%mm2 \n\t" | 133 "paddw 2*3+0(%1), %%mm1 \n\t"\ |
128 "paddw 2*2+0(%1), %%mm3 \n\t" | 134 "paddw 2*3+8(%1), %%mm2 \n\t"\ |
129 "paddw 2*2+8(%1), %%mm4 \n\t" | 135 "paddw 2*2+0(%1), %%mm3 \n\t"\ |
130 "pmullw %%mm5, %%mm3 \n\t" | 136 "paddw 2*2+8(%1), %%mm4 \n\t"\ |
131 "pmullw %%mm5, %%mm4 \n\t" | 137 "pmullw %%mm5, %%mm3 \n\t"\ |
132 "psubw %%mm1, %%mm3 \n\t" | 138 "pmullw %%mm5, %%mm4 \n\t"\ |
133 "psubw %%mm2, %%mm4 \n\t" | 139 "psubw %%mm1, %%mm3 \n\t"\ |
134 NORMALIZE_MMX("$7") | 140 "psubw %%mm2, %%mm4 \n\t"\ |
135 /* Remove bias */ | 141 NORMALIZE_MMX("$7")\ |
136 "paddw %%mm6, %%mm3 \n\t" | 142 /* Remove bias */\ |
137 "paddw %%mm6, %%mm4 \n\t" | 143 "paddw %%mm6, %%mm3 \n\t"\ |
138 TRANSFER_DO_PACK | 144 "paddw %%mm6, %%mm4 \n\t"\ |
139 "add $24, %1 \n\t" | 145 TRANSFER_DO_PACK(OP)\ |
140 "add %3, %2 \n\t" | 146 "add $24, %1 \n\t"\ |
141 "decl %0 \n\t" | 147 "add %3, %2 \n\t"\ |
142 "jnz 1b \n\t" | 148 "decl %0 \n\t"\ |
143 : "+r"(h), "+r" (src), "+r" (dst) | 149 "jnz 1b \n\t"\ |
144 : "r"(stride), "m"(rnd) | 150 : "+r"(h), "+r" (src), "+r" (dst)\ |
145 : "memory" | 151 : "r"(stride), "m"(rnd)\ |
146 ); | 152 : "memory"\ |
147 } | 153 );\ |
154 } | |
155 | |
156 VC1_HOR_16b_SHIFT2(OP_PUT, put_) | |
157 VC1_HOR_16b_SHIFT2(OP_AVG, avg_) | |
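Each converted function now follows the same template pattern: the OPNAME argument is token-pasted into the symbol name and OP is spliced into the store sequence, so one asm body yields both a put_ and an avg_ variant. A reduced model of the expansion (macro and function names hypothetical):

```c
#include <stdint.h>

#define OP_PUT_M(d, s) (s)                    /* overwrite           */
#define OP_AVG_M(d, s) (((d) + (s) + 1) >> 1) /* pavgb-style average */

/* One body, two symbols, mirroring
 * VC1_HOR_16b_SHIFT2(OP_PUT, put_) / VC1_HOR_16b_SHIFT2(OP_AVG, avg_). */
#define ROW_STORE(OP, OPNAME)                                     \
static void OPNAME ## row_store(uint8_t *dst, const uint8_t *src) \
{                                                                 \
    for (int i = 0; i < 8; i++)                                   \
        dst[i] = (uint8_t)OP(dst[i], src[i]);                     \
}

ROW_STORE(OP_PUT_M, put_) /* defines put_row_store() */
ROW_STORE(OP_AVG_M, avg_) /* defines avg_row_store() */
```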
148 | 158 |
149 | 159 |
150 /** | 160 /** |
151 * Purely vertical or horizontal 1/2 shift interpolation. | 161 * Purely vertical or horizontal 1/2 shift interpolation. |
152 * Sacrifice mm6 for the *9 factor. | 162 * Sacrifice mm6 for the *9 factor. |
153 */ | 163 */ |
154 static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src, | 164 #define VC1_SHIFT2(OP, OPNAME)\ |
155 x86_reg stride, int rnd, x86_reg offset) | 165 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ |
156 { | 166 x86_reg stride, int rnd, x86_reg offset)\ |
157 rnd = 8-rnd; | 167 {\ |
158 __asm__ volatile( | 168 rnd = 8-rnd;\ |
159 "mov $8, %%"REG_c" \n\t" | 169 __asm__ volatile(\ |
160 LOAD_ROUNDER_MMX("%5") | 170 "mov $8, %%"REG_c" \n\t"\ |
161 "movq "MANGLE(ff_pw_9)", %%mm6\n\t" | 171 LOAD_ROUNDER_MMX("%5")\ |
162 "1: \n\t" | 172 "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ |
163 "movd 0(%0 ), %%mm3 \n\t" | 173 "1: \n\t"\ |
164 "movd 4(%0 ), %%mm4 \n\t" | 174 "movd 0(%0 ), %%mm3 \n\t"\ |
165 "movd 0(%0,%2), %%mm1 \n\t" | 175 "movd 4(%0 ), %%mm4 \n\t"\ |
166 "movd 4(%0,%2), %%mm2 \n\t" | 176 "movd 0(%0,%2), %%mm1 \n\t"\ |
167 "add %2, %0 \n\t" | 177 "movd 4(%0,%2), %%mm2 \n\t"\ |
168 "punpcklbw %%mm0, %%mm3 \n\t" | 178 "add %2, %0 \n\t"\ |
169 "punpcklbw %%mm0, %%mm4 \n\t" | 179 "punpcklbw %%mm0, %%mm3 \n\t"\ |
170 "punpcklbw %%mm0, %%mm1 \n\t" | 180 "punpcklbw %%mm0, %%mm4 \n\t"\ |
171 "punpcklbw %%mm0, %%mm2 \n\t" | 181 "punpcklbw %%mm0, %%mm1 \n\t"\ |
172 "paddw %%mm1, %%mm3 \n\t" | 182 "punpcklbw %%mm0, %%mm2 \n\t"\ |
173 "paddw %%mm2, %%mm4 \n\t" | 183 "paddw %%mm1, %%mm3 \n\t"\ |
174 "movd 0(%0,%3), %%mm1 \n\t" | 184 "paddw %%mm2, %%mm4 \n\t"\ |
175 "movd 4(%0,%3), %%mm2 \n\t" | 185 "movd 0(%0,%3), %%mm1 \n\t"\ |
176 "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/ | 186 "movd 4(%0,%3), %%mm2 \n\t"\ |
177 "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/ | 187 "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ |
178 "punpcklbw %%mm0, %%mm1 \n\t" | 188 "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ |
179 "punpcklbw %%mm0, %%mm2 \n\t" | 189 "punpcklbw %%mm0, %%mm1 \n\t"\ |
180 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/ | 190 "punpcklbw %%mm0, %%mm2 \n\t"\ |
181 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/ | 191 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ |
182 "movd 0(%0,%2), %%mm1 \n\t" | 192 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ |
183 "movd 4(%0,%2), %%mm2 \n\t" | 193 "movd 0(%0,%2), %%mm1 \n\t"\ |
184 "punpcklbw %%mm0, %%mm1 \n\t" | 194 "movd 4(%0,%2), %%mm2 \n\t"\ |
185 "punpcklbw %%mm0, %%mm2 \n\t" | 195 "punpcklbw %%mm0, %%mm1 \n\t"\ |
186 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/ | 196 "punpcklbw %%mm0, %%mm2 \n\t"\ |
187 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/ | 197 "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ |
188 NORMALIZE_MMX("$4") | 198 "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ |
189 "packuswb %%mm4, %%mm3 \n\t" | 199 NORMALIZE_MMX("$4")\ |
190 "movq %%mm3, (%1) \n\t" | 200 "packuswb %%mm4, %%mm3 \n\t"\ |
191 "add %6, %0 \n\t" | 201 OP((%1), %%mm3)\ |
192 "add %4, %1 \n\t" | 202 "movq %%mm3, (%1) \n\t"\ |
193 "dec %%"REG_c" \n\t" | 203 "add %6, %0 \n\t"\ |
194 "jnz 1b \n\t" | 204 "add %4, %1 \n\t"\ |
195 : "+r"(src), "+r"(dst) | 205 "dec %%"REG_c" \n\t"\ |
196 : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd), | 206 "jnz 1b \n\t"\ |
197 "g"(stride-offset) | 207 : "+r"(src), "+r"(dst)\ |
198 : "%"REG_c, "memory" | 208 : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ |
199 ); | 209 "g"(stride-offset)\ |
200 } | 210 : "%"REG_c, "memory"\ |
211 );\ | |
212 } | |
213 | |
214 VC1_SHIFT2(OP_PUT, put_) | |
215 VC1_SHIFT2(OP_AVG, avg_) | |
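As the inline comments note, this kernel applies the (-1, 9, 9, -1) half-pel taps; NORMALIZE_MMX("$4") then adds the rounder and shifts right by 4 (the taps sum to 16). A scalar model of one output pixel, assuming r is the 8 - rnd value computed at the top of the function:

```c
#include <stdint.h>

/* Scalar model of one VC1_SHIFT2 output pixel: taps (-1, 9, 9, -1)
 * along 'step' (1 for horizontal, the stride for vertical), rounded
 * and shifted per NORMALIZE_MMX("$4"), then saturated as packuswb does. */
static uint8_t shift2_scalar(const uint8_t *s, int step, int r)
{
    int v = 9 * (s[0] + s[step]) - s[-step] - s[2 * step];
    v = (v + r) >> 4;
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}
```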
201 | 216 |
202 /** | 217 /** |
203 * Filter coefficients made global to allow access by all 1/4 and 3/4 shift | 218 * Filter coefficients made global to allow access by all 1/4 and 3/4 shift |
204 * interpolation functions. | 219 * interpolation functions. |
205 */ | 220 */ |
270 "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ | 285 "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ |
271 ASMALIGN(3) \ | 286 ASMALIGN(3) \ |
272 "1: \n\t" \ | 287 "1: \n\t" \ |
273 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ | 288 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ |
274 NORMALIZE_MMX("%6") \ | 289 NORMALIZE_MMX("%6") \ |
275 TRANSFER_DONT_PACK \ | 290 TRANSFER_DONT_PACK(OP_PUT) \ |
276 /* Last 3 (in fact 4) bytes on the line */ \ | 291 /* Last 3 (in fact 4) bytes on the line */ \ |
277 "movd 8+"A1", %%mm1 \n\t" \ | 292 "movd 8+"A1", %%mm1 \n\t" \ |
278 DO_UNPACK("%%mm1") \ | 293 DO_UNPACK("%%mm1") \ |
279 "movq %%mm1, %%mm3 \n\t" \ | 294 "movq %%mm1, %%mm3 \n\t" \ |
280 "paddw %%mm1, %%mm1 \n\t" \ | 295 "paddw %%mm1, %%mm1 \n\t" \ |
310 * Here, offset=16bits, so parameters passed A1 to A4 should be simple. | 325 * Here, offset=16bits, so parameters passed A1 to A4 should be simple. |
311 * | 326 * |
312 * @param NAME Either 1 or 3 | 327 * @param NAME Either 1 or 3 |
313 * @see MSPEL_FILTER13_CORE for information on A1->A4 | 328 * @see MSPEL_FILTER13_CORE for information on A1->A4 |
314 */ | 329 */ |
315 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4) \ | 330 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
316 static void \ | 331 static void \ |
317 vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ | 332 OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ |
318 const int16_t *src, int rnd) \ | 333 const int16_t *src, int rnd) \ |
319 { \ | 334 { \ |
320 int h = 8; \ | 335 int h = 8; \ |
321 src -= 1; \ | 336 src -= 1; \ |
322 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ | 337 rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ |
329 MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ | 344 MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ |
330 NORMALIZE_MMX("$7") \ | 345 NORMALIZE_MMX("$7") \ |
331 /* Remove bias */ \ | 346 /* Remove bias */ \ |
332 "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ | 347 "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ |
333 "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ | 348 "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ |
334 TRANSFER_DO_PACK \ | 349 TRANSFER_DO_PACK(OP) \ |
335 "add $24, %1 \n\t" \ | 350 "add $24, %1 \n\t" \ |
336 "add %3, %2 \n\t" \ | 351 "add %3, %2 \n\t" \ |
337 "decl %0 \n\t" \ | 352 "decl %0 \n\t" \ |
338 "jnz 1b \n\t" \ | 353 "jnz 1b \n\t" \ |
339 : "+r"(h), "+r" (src), "+r" (dst) \ | 354 : "+r"(h), "+r" (src), "+r" (dst) \ |
348 * %3 (offset) and %4 (3*offset). | 363 * %3 (offset) and %4 (3*offset). |
349 * | 364 * |
350 * @param NAME Either 1 or 3 | 365 * @param NAME Either 1 or 3 |
351 * @see MSPEL_FILTER13_CORE for information on A1->A4 | 366 * @see MSPEL_FILTER13_CORE for information on A1->A4 |
352 */ | 367 */ |
353 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4) \ | 368 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
354 static void \ | 369 static void \ |
355 vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ | 370 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ |
356 x86_reg stride, int rnd, x86_reg offset) \ | 371 x86_reg stride, int rnd, x86_reg offset) \ |
357 { \ | 372 { \ |
358 int h = 8; \ | 373 int h = 8; \ |
359 src -= offset; \ | 374 src -= offset; \ |
360 rnd = 32-rnd; \ | 375 rnd = 32-rnd; \ |
364 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ | 379 "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ |
365 ASMALIGN(3) \ | 380 ASMALIGN(3) \ |
366 "1: \n\t" \ | 381 "1: \n\t" \ |
367 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ | 382 MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ |
368 NORMALIZE_MMX("$6") \ | 383 NORMALIZE_MMX("$6") \ |
369 TRANSFER_DO_PACK \ | 384 TRANSFER_DO_PACK(OP) \ |
370 "add %5, %1 \n\t" \ | 385 "add %5, %1 \n\t" \ |
371 "add %5, %2 \n\t" \ | 386 "add %5, %2 \n\t" \ |
372 "decl %0 \n\t" \ | 387 "decl %0 \n\t" \ |
373 "jnz 1b \n\t" \ | 388 "jnz 1b \n\t" \ |
374 : "+r"(h), "+r" (src), "+r" (dst) \ | 389 : "+r"(h), "+r" (src), "+r" (dst) \ |
376 : "memory" \ | 391 : "memory" \ |
377 ); \ | 392 ); \ |
378 } | 393 } |
379 | 394 |
380 /** 1/4 shift bicubic interpolation */ | 395 /** 1/4 shift bicubic interpolation */ |
381 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") | 396 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_) |
397 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_) | |
382 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") | 398 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") |
383 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)") | 399 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_) |
400 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_) | |
384 | 401 |
385 /** 3/4 shift bicubic interpolation */ | 402 /** 3/4 shift bicubic interpolation */ |
386 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") | 403 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_) |
404 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_) | |
387 MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") | 405 MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") |
388 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)") | 406 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_) |
407 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_) | |
389 | 408 |
390 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); | 409 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); |
391 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); | 410 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); |
392 typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); | 411 typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); |
393 | 412 |
400 * @param stride Stride for both src and dst buffers. | 419 * @param stride Stride for both src and dst buffers. |
401 * @param hmode Horizontal filter (expressed in quarter pixels shift). | 420 * @param hmode Horizontal filter (expressed in quarter pixels shift). |
402 * @param vmode Vertical filter. | 421 * @param vmode Vertical filter. |
403 * @param rnd Rounding bias. | 422 * @param rnd Rounding bias. |
404 */ | 423 */ |
405 static void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, | 424 #define VC1_MSPEL_MC(OP)\ |
406 int hmode, int vmode, int rnd) | 425 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ |
407 { | 426 int hmode, int vmode, int rnd)\ |
408 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] = | 427 {\ |
409 { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx }; | 428 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ |
410 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] = | 429 { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\ |
411 { NULL, vc1_put_hor_16b_shift1_mmx, vc1_put_hor_16b_shift2_mmx, vc1_put_hor_16b_shift3_mmx }; | 430 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ |
412 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = | 431 { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\ |
413 { NULL, vc1_put_shift1_mmx, vc1_put_shift2_mmx, vc1_put_shift3_mmx }; | 432 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\ |
414 | 433 { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\ |
415 __asm__ volatile( | 434 \ |
416 "pxor %%mm0, %%mm0 \n\t" | 435 __asm__ volatile(\ |
417 ::: "memory" | 436 "pxor %%mm0, %%mm0 \n\t"\ |
418 ); | 437 ::: "memory"\ |
419 | 438 );\ |
420 if (vmode) { /* Vertical filter to apply */ | 439 \ |
421 if (hmode) { /* Horizontal filter to apply, output to tmp */ | 440 if (vmode) { /* Vertical filter to apply */\ |
422 static const int shift_value[] = { 0, 5, 1, 5 }; | 441 if (hmode) { /* Horizontal filter to apply, output to tmp */\ |
423 int shift = (shift_value[hmode]+shift_value[vmode])>>1; | 442 static const int shift_value[] = { 0, 5, 1, 5 };\ |
424 int r; | 443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ |
425 DECLARE_ALIGNED_16(int16_t, tmp[12*8]); | 444 int r;\ |
426 | 445 DECLARE_ALIGNED_16(int16_t, tmp[12*8]);\ |
427 r = (1<<(shift-1)) + rnd-1; | 446 \ |
428 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); | 447 r = (1<<(shift-1)) + rnd-1;\ |
429 | 448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ |
430 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); | 449 \ |
431 return; | 450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ |
432 } | 451 return;\ |
433 else { /* No horizontal filter, output 8 lines to dst */ | 452 }\ |
434 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); | 453 else { /* No horizontal filter, output 8 lines to dst */\ |
435 return; | 454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ |
436 } | 455 return;\ |
437 } | 456 }\ |
438 | 457 }\ |
439 /* Horizontal mode with no vertical mode */ | 458 \ |
440 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); | 459 /* Horizontal mode with no vertical mode */\ |
441 } | 460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ |
461 } | |
462 | |
463 VC1_MSPEL_MC(put_) | |
464 VC1_MSPEL_MC(avg_) | |
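Note that only the final store stage is templated on OP: the ver_16bits table keeps the un-prefixed vc1_put_ver_16b_* entries because the vertical pass writes 16-bit intermediates into tmp, while the hor_16bits and 8bits tables are OP-prefixed, so averaging against the visible destination happens exactly once, at the last 8-bit store. A sketch of that two-pass shape (the pass prototypes below are hypothetical stand-ins for the MMX kernels):

```c
#include <stdint.h>

/* Hypothetical stand-ins for the real vertical/horizontal kernels. */
void ver_pass_put_16b(int16_t *tmp, const uint8_t *src, int stride);
void hor_pass_put(uint8_t *dst, int stride, const int16_t *tmp);
void hor_pass_avg(uint8_t *dst, int stride, const int16_t *tmp);

/* Shape of OP##vc1_mspel_mc when both hmode and vmode are set: the
 * intermediate pass always "puts"; only the final pass puts or avgs. */
static void mspel_mc_sketch(uint8_t *dst, const uint8_t *src,
                            int stride, int avg)
{
    int16_t tmp[12 * 8]; /* 16-bit scratch, as in the real function */
    ver_pass_put_16b(tmp, src - 1, stride);
    (avg ? hor_pass_avg : hor_pass_put)(dst, stride, tmp + 1);
}
```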
442 | 465 |
443 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); | 466 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); |
467 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |
444 | 468 |
445 /** Macro to ease bicubic filter interpolation functions declarations */ | 469 /** Macro to ease bicubic filter interpolation functions declarations */ |
446 #define DECLARE_FUNCTION(a, b) \ | 470 #define DECLARE_FUNCTION(a, b) \ |
447 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | 471 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ |
448 vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | 472 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
473 }\ | |
474 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | |
475 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | |
449 } | 476 } |
450 | 477 |
451 DECLARE_FUNCTION(0, 1) | 478 DECLARE_FUNCTION(0, 1) |
452 DECLARE_FUNCTION(0, 2) | 479 DECLARE_FUNCTION(0, 2) |
453 DECLARE_FUNCTION(0, 3) | 480 DECLARE_FUNCTION(0, 3) |
466 DECLARE_FUNCTION(3, 1) | 493 DECLARE_FUNCTION(3, 1) |
467 DECLARE_FUNCTION(3, 2) | 494 DECLARE_FUNCTION(3, 2) |
468 DECLARE_FUNCTION(3, 3) | 495 DECLARE_FUNCTION(3, 3) |
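The tab[] slots assigned below follow hmode + 4*vmode, matching the mc##ab naming: for example, mc31 (hmode 3, vmode 1) lands in slot 3 + 4*1 = 7.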
469 | 496 |
470 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { | 497 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { |
498 mm_flags = mm_support(); | |
499 | |
471 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; | 500 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; |
472 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; | 501 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; |
473 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; | 502 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; |
474 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; | 503 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; |
475 | 504 |
485 | 514 |
486 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; | 515 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; |
487 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; | 516 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; |
488 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; | 517 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; |
489 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; | 518 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; |
490 } | 519 |
520 if (mm_flags & FF_MM_MMX2){ | |
521 dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2; | |
522 dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2; | |
523 dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2; | |
524 dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmx2; | |
525 | |
526 dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmx2; | |
527 dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmx2; | |
528 dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmx2; | |
529 dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmx2; | |
530 | |
531 dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmx2; | |
532 dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmx2; | |
533 dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmx2; | |
534 dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmx2; | |
535 | |
536 dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmx2; | |
537 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; | |
538 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; | |
539 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; | |
540 } | |
541 } |
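The avg table is gated behind the FF_MM_MMX2 runtime check because pavgb, which OP_AVG relies on, is an MMX2 instruction; the put table remains plain MMX and is installed unconditionally.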