comparison: i386/snowdsp_mmx.c @ 8031:eebc7209c47f (libavcodec)
Convert the asm keyword into __asm__.

Neither the asm() nor the __asm__() keyword is part of the C99 standard. GCC accepts the former in C89 mode, but rejects it in C99 mode unless GNU extensions are explicitly enabled (with -fasm). The latter form is accepted in any mode as an extension, without requiring further command-line options.

The Sun Studio C99 compiler likewise rejects asm() while accepting __asm__(), albeit with warnings that it is not valid C99 syntax.
author:   flameeyes
date:     Thu, 16 Oct 2008 13:34:09 +0000
parents:  f7cbb7733146
children: (none)
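As a hedged illustration of the behavior the commit message describes (the file name, macro, and nop placeholder below are hypothetical, not part of the changeset), a minimal C translation unit shows why the rename matters:

    /* keyword_demo.c -- hypothetical sketch, not from the changeset.
     * Expected results with GCC:
     *   gcc -std=gnu89 -DUSE_PLAIN_ASM -c keyword_demo.c      # 'asm' accepted (GNU C89)
     *   gcc -std=c99   -DUSE_PLAIN_ASM -c keyword_demo.c      # fails: plain 'asm' not recognized
     *   gcc -std=c99 -fasm -DUSE_PLAIN_ASM -c keyword_demo.c  # 'asm' accepted again
     *   gcc -std=c99   -c keyword_demo.c                      # '__asm__' needs no extra flags
     */
    static void keyword_demo(void)
    {
    #ifdef USE_PLAIN_ASM
        asm volatile("nop");        /* GNU extension; rejected in strict C99 */
    #else
        __asm__ volatile("nop");    /* reserved identifier; accepted in every mode */
    #endif
    }

Because __asm__ lives in the implementation's reserved identifier space, the spelling is portable across conformance modes; the diff below applies exactly this rename throughout the file.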
--- i386/snowdsp_mmx.c @ 8030:a512ac8fa540
+++ i386/snowdsp_mmx.c @ 8031:eebc7209c47f
@@ -36,19 +36,19 @@
 // (the first time erroneously), we allow the SSE2 code to run an extra pass.
 // The savings in code and time are well worth having to store this value and
 // calculate b[0] correctly afterwards.

 i = 0;
-asm volatile(
+__asm__ volatile(
 "pcmpeqd %%xmm7, %%xmm7 \n\t"
 "pcmpeqd %%xmm3, %%xmm3 \n\t"
 "psllw $1, %%xmm3 \n\t"
 "paddw %%xmm7, %%xmm3 \n\t"
 "psllw $13, %%xmm3 \n\t"
 ::);
 for(; i<w_l-15; i+=16){
-asm volatile(
+__asm__ volatile(
 "movdqu (%1), %%xmm1 \n\t"
 "movdqu 16(%1), %%xmm5 \n\t"
 "movdqu 2(%1), %%xmm2 \n\t"
 "movdqu 18(%1), %%xmm6 \n\t"
 "paddw %%xmm1, %%xmm2 \n\t"
@@ -75,11 +75,11 @@
 i = 0;
 for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
 dst[i] = dst[i] - (b[i] + b[i + 1]);
 }
 for(; i<w_r-15; i+=16){
-asm volatile(
+__asm__ volatile(
 "movdqu (%1), %%xmm1 \n\t"
 "movdqu 16(%1), %%xmm5 \n\t"
 "movdqu 2(%1), %%xmm2 \n\t"
 "movdqu 18(%1), %%xmm6 \n\t"
 "paddw %%xmm1, %%xmm2 \n\t"
@@ -100,18 +100,18 @@
 { // Lift 2
 IDWTELEM * const ref = b+w2 - 1;
 IDWTELEM b_0 = b[0];

 i = 0;
-asm volatile(
+__asm__ volatile(
 "psllw $15, %%xmm7 \n\t"
 "pcmpeqw %%xmm6, %%xmm6 \n\t"
 "psrlw $13, %%xmm6 \n\t"
 "paddw %%xmm7, %%xmm6 \n\t"
 ::);
 for(; i<w_l-15; i+=16){
-asm volatile(
+__asm__ volatile(
 "movdqu (%1), %%xmm0 \n\t"
 "movdqu 16(%1), %%xmm4 \n\t"
 "movdqu 2(%1), %%xmm1 \n\t"
 "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
 "paddw %%xmm6, %%xmm0 \n\t"
@@ -148,11 +148,11 @@
 i = 0;
 for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
 temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
 }
 for(; i<w_r-7; i+=8){
-asm volatile(
+__asm__ volatile(
 "movdqu 2(%1), %%xmm2 \n\t"
 "movdqu 18(%1), %%xmm6 \n\t"
 "paddw (%1), %%xmm2 \n\t"
 "paddw 16(%1), %%xmm6 \n\t"
 "movdqu (%0), %%xmm0 \n\t"
@@ -178,11 +178,11 @@
 for (; (i & 0x3E) != 0x3E; i-=2){
 b[i+1] = temp[i>>1];
 b[i] = b[i>>1];
 }
 for (i-=62; i>=0; i-=64){
-asm volatile(
+__asm__ volatile(
 "movdqa (%1), %%xmm0 \n\t"
 "movdqa 16(%1), %%xmm2 \n\t"
 "movdqa 32(%1), %%xmm4 \n\t"
 "movdqa 48(%1), %%xmm6 \n\t"
 "movdqa (%1), %%xmm1 \n\t"
@@ -222,19 +222,19 @@
 { // Lift 0
 IDWTELEM * const ref = b + w2 - 1;

 i = 1;
 b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
-asm volatile(
+__asm__ volatile(
 "pcmpeqw %%mm7, %%mm7 \n\t"
 "pcmpeqw %%mm3, %%mm3 \n\t"
 "psllw $1, %%mm3 \n\t"
 "paddw %%mm7, %%mm3 \n\t"
 "psllw $13, %%mm3 \n\t"
 ::);
 for(; i<w_l-7; i+=8){
-asm volatile(
+__asm__ volatile(
 "movq (%1), %%mm2 \n\t"
 "movq 8(%1), %%mm6 \n\t"
 "paddw 2(%1), %%mm2 \n\t"
 "paddw 10(%1), %%mm6 \n\t"
 "paddw %%mm7, %%mm2 \n\t"
@@ -255,11 +255,11 @@
 { // Lift 1
 IDWTELEM * const dst = b+w2;

 i = 0;
 for(; i<w_r-7; i+=8){
-asm volatile(
+__asm__ volatile(
 "movq (%1), %%mm2 \n\t"
 "movq 8(%1), %%mm6 \n\t"
 "paddw 2(%1), %%mm2 \n\t"
 "paddw 10(%1), %%mm6 \n\t"
 "movq (%0), %%mm0 \n\t"
@@ -278,18 +278,18 @@
 { // Lift 2
 IDWTELEM * const ref = b+w2 - 1;

 i = 1;
 b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
-asm volatile(
+__asm__ volatile(
 "psllw $15, %%mm7 \n\t"
 "pcmpeqw %%mm6, %%mm6 \n\t"
 "psrlw $13, %%mm6 \n\t"
 "paddw %%mm7, %%mm6 \n\t"
 ::);
 for(; i<w_l-7; i+=8){
-asm volatile(
+__asm__ volatile(
 "movq (%1), %%mm0 \n\t"
 "movq 8(%1), %%mm4 \n\t"
 "movq 2(%1), %%mm1 \n\t"
 "movq 10(%1), %%mm5 \n\t"
 "paddw %%mm6, %%mm0 \n\t"
@@ -322,11 +322,11 @@
 { // Lift 3
 IDWTELEM * const src = b+w2;
 i = 0;

 for(; i<w_r-7; i+=8){
-asm volatile(
+__asm__ volatile(
 "movq 2(%1), %%mm2 \n\t"
 "movq 10(%1), %%mm6 \n\t"
 "paddw (%1), %%mm2 \n\t"
 "paddw 8(%1), %%mm6 \n\t"
 "movq (%0), %%mm0 \n\t"
@@ -352,11 +352,11 @@
 for (; (i & 0x1E) != 0x1E; i-=2){
 b[i+1] = temp[i>>1];
 b[i] = b[i>>1];
 }
 for (i-=30; i>=0; i-=32){
-asm volatile(
+__asm__ volatile(
 "movq (%1), %%mm0 \n\t"
 "movq 8(%1), %%mm2 \n\t"
 "movq 16(%1), %%mm4 \n\t"
 "movq 24(%1), %%mm6 \n\t"
 "movq (%1), %%mm1 \n\t"
@@ -446,11 +446,11 @@
 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
 }
 i+=i;

-asm volatile (
+__asm__ volatile (
 "jmp 2f \n\t"
 "1: \n\t"
 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
 snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")

@@ -542,11 +542,11 @@
 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
 }
 i+=i;
-asm volatile(
+__asm__ volatile(
 "jmp 2f \n\t"
 "1: \n\t"

 snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
 snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
@@ -604,11 +604,11 @@
 #endif //HAVE_7REGS

 #define snow_inner_add_yblock_sse2_header \
 IDWTELEM * * dst_array = sb->line + src_y;\
 x86_reg tmp;\
-asm volatile(\
+__asm__ volatile(\
 "mov %7, %%"REG_c" \n\t"\
 "mov %6, %2 \n\t"\
 "mov %4, %%"REG_S" \n\t"\
 "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
 "pcmpeqd %%xmm3, %%xmm3 \n\t"\
@@ -757,11 +757,11 @@
 }

 #define snow_inner_add_yblock_mmx_header \
 IDWTELEM * * dst_array = sb->line + src_y;\
 x86_reg tmp;\
-asm volatile(\
+__asm__ volatile(\
 "mov %7, %%"REG_c" \n\t"\
 "mov %6, %2 \n\t"\
 "mov %4, %%"REG_S" \n\t"\
 "pxor %%mm7, %%mm7 \n\t" /* 0 */\
 "pcmpeqd %%mm3, %%mm3 \n\t"\