libavcodec: comparison of i386/snowdsp_mmx.c @ 8031:eebc7209c47f

Convert asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). The Sun Studio C99 compiler likewise rejects asm() while accepting __asm__(), albeit with warnings that it is not valid C99 syntax.
author flameeyes
date Thu, 16 Oct 2008 13:34:09 +0000
parents f7cbb7733146
children
comparing revision 8030:a512ac8fa540 with revision 8031:eebc7209c47f
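For context, a minimal sketch of the portability point made in the commit message above; the file name and function are hypothetical and not part of this changeset. With GCC, the plain asm spelling fails to parse under -std=c99 unless GNU extensions are enabled, while the __asm__ spelling is accepted as an extension in any mode without extra command-line options.

    /* keyword_sketch.c -- illustrative only, not part of snowdsp_mmx.c.
     *   gcc -std=c99   -c keyword_sketch.c    (a plain "asm" statement would fail to parse here)
     *   gcc -std=gnu99 -c keyword_sketch.c    (both spellings are accepted)
     */
    static int add_one(int x)
    {
        /* __asm__ is recognized as a GNU extension even in strict C99 mode,
         * so no additional flag such as -fasm is needed. */
        __asm__ volatile("addl $1, %0" : "+r"(x));
        return x;
    }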
   36          // (the first time erroneously), we allow the SSE2 code to run an extra pass.
   37          // The savings in code and time are well worth having to store this value and
   38          // calculate b[0] correctly afterwards.
   39
   40          i = 0;
-  41          asm volatile(
+  41          __asm__ volatile(
   42              "pcmpeqd %%xmm7, %%xmm7 \n\t"
   43              "pcmpeqd %%xmm3, %%xmm3 \n\t"
   44              "psllw $1, %%xmm3 \n\t"
   45              "paddw %%xmm7, %%xmm3 \n\t"
   46              "psllw $13, %%xmm3 \n\t"
   47          ::);
   48          for(; i<w_l-15; i+=16){
-  49              asm volatile(
+  49              __asm__ volatile(
   50                  "movdqu (%1), %%xmm1 \n\t"
   51                  "movdqu 16(%1), %%xmm5 \n\t"
   52                  "movdqu 2(%1), %%xmm2 \n\t"
   53                  "movdqu 18(%1), %%xmm6 \n\t"
   54                  "paddw %%xmm1, %%xmm2 \n\t"
  ...
   75          i = 0;
   76          for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
   77              dst[i] = dst[i] - (b[i] + b[i + 1]);
   78          }
   79          for(; i<w_r-15; i+=16){
-  80              asm volatile(
+  80              __asm__ volatile(
   81                  "movdqu (%1), %%xmm1 \n\t"
   82                  "movdqu 16(%1), %%xmm5 \n\t"
   83                  "movdqu 2(%1), %%xmm2 \n\t"
   84                  "movdqu 18(%1), %%xmm6 \n\t"
   85                  "paddw %%xmm1, %%xmm2 \n\t"
  ...
  100      { // Lift 2
  101          IDWTELEM * const ref = b+w2 - 1;
  102          IDWTELEM b_0 = b[0];
  103
  104          i = 0;
- 105          asm volatile(
+ 105          __asm__ volatile(
  106              "psllw $15, %%xmm7 \n\t"
  107              "pcmpeqw %%xmm6, %%xmm6 \n\t"
  108              "psrlw $13, %%xmm6 \n\t"
  109              "paddw %%xmm7, %%xmm6 \n\t"
  110          ::);
  111          for(; i<w_l-15; i+=16){
- 112              asm volatile(
+ 112              __asm__ volatile(
  113                  "movdqu (%1), %%xmm0 \n\t"
  114                  "movdqu 16(%1), %%xmm4 \n\t"
  115                  "movdqu 2(%1), %%xmm1 \n\t"
  116                  "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
  117                  "paddw %%xmm6, %%xmm0 \n\t"
  ...
  148          i = 0;
  149          for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
  150              temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
  151          }
  152          for(; i<w_r-7; i+=8){
- 153              asm volatile(
+ 153              __asm__ volatile(
  154                  "movdqu 2(%1), %%xmm2 \n\t"
  155                  "movdqu 18(%1), %%xmm6 \n\t"
  156                  "paddw (%1), %%xmm2 \n\t"
  157                  "paddw 16(%1), %%xmm6 \n\t"
  158                  "movdqu (%0), %%xmm0 \n\t"
  ...
  178          for (; (i & 0x3E) != 0x3E; i-=2){
  179              b[i+1] = temp[i>>1];
  180              b[i] = b[i>>1];
  181          }
  182          for (i-=62; i>=0; i-=64){
- 183              asm volatile(
+ 183              __asm__ volatile(
  184                  "movdqa (%1), %%xmm0 \n\t"
  185                  "movdqa 16(%1), %%xmm2 \n\t"
  186                  "movdqa 32(%1), %%xmm4 \n\t"
  187                  "movdqa 48(%1), %%xmm6 \n\t"
  188                  "movdqa (%1), %%xmm1 \n\t"
  ...
  222      { // Lift 0
  223          IDWTELEM * const ref = b + w2 - 1;
  224
  225          i = 1;
  226          b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
- 227          asm volatile(
+ 227          __asm__ volatile(
  228              "pcmpeqw %%mm7, %%mm7 \n\t"
  229              "pcmpeqw %%mm3, %%mm3 \n\t"
  230              "psllw $1, %%mm3 \n\t"
  231              "paddw %%mm7, %%mm3 \n\t"
  232              "psllw $13, %%mm3 \n\t"
  233          ::);
  234          for(; i<w_l-7; i+=8){
- 235              asm volatile(
+ 235              __asm__ volatile(
  236                  "movq (%1), %%mm2 \n\t"
  237                  "movq 8(%1), %%mm6 \n\t"
  238                  "paddw 2(%1), %%mm2 \n\t"
  239                  "paddw 10(%1), %%mm6 \n\t"
  240                  "paddw %%mm7, %%mm2 \n\t"
  ...
  255      { // Lift 1
  256          IDWTELEM * const dst = b+w2;
  257
  258          i = 0;
  259          for(; i<w_r-7; i+=8){
- 260              asm volatile(
+ 260              __asm__ volatile(
  261                  "movq (%1), %%mm2 \n\t"
  262                  "movq 8(%1), %%mm6 \n\t"
  263                  "paddw 2(%1), %%mm2 \n\t"
  264                  "paddw 10(%1), %%mm6 \n\t"
  265                  "movq (%0), %%mm0 \n\t"
  ...
  278      { // Lift 2
  279          IDWTELEM * const ref = b+w2 - 1;
  280
  281          i = 1;
  282          b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
- 283          asm volatile(
+ 283          __asm__ volatile(
  284              "psllw $15, %%mm7 \n\t"
  285              "pcmpeqw %%mm6, %%mm6 \n\t"
  286              "psrlw $13, %%mm6 \n\t"
  287              "paddw %%mm7, %%mm6 \n\t"
  288          ::);
  289          for(; i<w_l-7; i+=8){
- 290              asm volatile(
+ 290              __asm__ volatile(
  291                  "movq (%1), %%mm0 \n\t"
  292                  "movq 8(%1), %%mm4 \n\t"
  293                  "movq 2(%1), %%mm1 \n\t"
  294                  "movq 10(%1), %%mm5 \n\t"
  295                  "paddw %%mm6, %%mm0 \n\t"
  ...
  322      { // Lift 3
  323          IDWTELEM * const src = b+w2;
  324          i = 0;
  325
  326          for(; i<w_r-7; i+=8){
- 327              asm volatile(
+ 327              __asm__ volatile(
  328                  "movq 2(%1), %%mm2 \n\t"
  329                  "movq 10(%1), %%mm6 \n\t"
  330                  "paddw (%1), %%mm2 \n\t"
  331                  "paddw 8(%1), %%mm6 \n\t"
  332                  "movq (%0), %%mm0 \n\t"
  ...
  352          for (; (i & 0x1E) != 0x1E; i-=2){
  353              b[i+1] = temp[i>>1];
  354              b[i] = b[i>>1];
  355          }
  356          for (i-=30; i>=0; i-=32){
- 357              asm volatile(
+ 357              __asm__ volatile(
  358                  "movq (%1), %%mm0 \n\t"
  359                  "movq 8(%1), %%mm2 \n\t"
  360                  "movq 16(%1), %%mm4 \n\t"
  361                  "movq 24(%1), %%mm6 \n\t"
  362                  "movq (%1), %%mm1 \n\t"
  ...
  446          b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
  447          b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
  448      }
  449      i+=i;
  450
- 451      asm volatile (
+ 451      __asm__ volatile (
  452          "jmp 2f \n\t"
  453          "1: \n\t"
  454          snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
  455          snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
  456
  ...
  542          b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
  543          b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
  544          b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
  545      }
  546      i+=i;
- 547      asm volatile(
+ 547      __asm__ volatile(
  548          "jmp 2f \n\t"
  549          "1: \n\t"
  550
  551          snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
  552          snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
  ...
  604  #endif //HAVE_7REGS
  605
  606  #define snow_inner_add_yblock_sse2_header \
  607      IDWTELEM * * dst_array = sb->line + src_y;\
  608      x86_reg tmp;\
- 609      asm volatile(\
+ 609      __asm__ volatile(\
  610          "mov %7, %%"REG_c" \n\t"\
  611          "mov %6, %2 \n\t"\
  612          "mov %4, %%"REG_S" \n\t"\
  613          "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
  614          "pcmpeqd %%xmm3, %%xmm3 \n\t"\
  ...
  757  }
  758
  759  #define snow_inner_add_yblock_mmx_header \
  760      IDWTELEM * * dst_array = sb->line + src_y;\
  761      x86_reg tmp;\
- 762      asm volatile(\
+ 762      __asm__ volatile(\
  763          "mov %7, %%"REG_c" \n\t"\
  764          "mov %6, %2 \n\t"\
  765          "mov %4, %%"REG_S" \n\t"\
  766          "pxor %%mm7, %%mm7 \n\t" /* 0 */\
  767          "pcmpeqd %%mm3, %%mm3 \n\t"\