comparison i386/snowdsp_mmx.c @ 5594:384629ebcb93 libavcodec

avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum (patch for plain mmx support is welcome ...)
author michael
date Sun, 26 Aug 2007 01:11:02 +0000
parents bd015f9ea964
children 9da9e00a04a5
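The idea behind the change, common to all four hunks below: IDWTELEM coefficients are 16-bit, and the old code summed two of them (plus the rounding constant) with a plain paddw, which wraps as soon as the true sum needs 17 bits. The new code never stores that sum in a 16-bit lane: it biases both operands into unsigned range, lets pavgw do the add-and-halve (pavgw averages unsigned words through a 17-bit intermediate, and it is the instruction that raises the requirement to MMX2), then subtracts the bias again. Below is a minimal scalar sketch of that identity, using the bias 0x8000 and the rounding offset 7 that the two vertical hunks build with psllw $15 and psrlw $13; the horizontal hunks derive their bias from a value already sitting in xmm7/mm7, which is outside this excerpt. Function names and the test loop are illustrative, not from the file.

    #include <assert.h>
    #include <stdint.h>

    /* Rounded halving of a 17-bit sum without ever storing that sum in a
     * 16-bit lane.  The constants mirror the vertical hunks below:
     * psllw $15 of all-ones -> 0x8000 per word, psrlw $13 -> 7. */
    static int16_t halve_sum_pavgw_style(int16_t a, int16_t b)
    {
        uint16_t ua  = (uint16_t)(a + 0x8000 + 7); /* paddw with bias+7 (wraps like paddw) */
        uint16_t ub  = (uint16_t)(b + 0x8000);     /* paddw with the bias                  */
        uint16_t avg = (uint16_t)(((uint32_t)ua + ub + 1) >> 1); /* pavgw: unsigned, 17-bit */
        return (int16_t)(avg - 0x8000);            /* psubw removes the bias               */
    }

    int main(void)
    {
        /* 32-bit reference is (a + b + 8) >> 1; arithmetic >> on negatives is
         * assumed, as in the snow C code.  The loop stays away from the very
         * top of the range, where the exact result would not fit in 16 bits
         * anyway. */
        for (int a = -32768; a < 32600; a += 97)
            for (int b = -32768; b < 32600; b += 89)
                assert(halve_sum_pavgw_style((int16_t)a, (int16_t)b) == ((a + b + 8) >> 1));
        return 0;
    }

The asm then finishes the lift with the psraw $1 / psraw $2 stages, so wherever the old code did not overflow the stored result is unchanged; only the order of operations differs.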
--- a/i386/snowdsp_mmx.c  5593:bd015f9ea964
+++ b/i386/snowdsp_mmx.c  5594:384629ebcb93
@@ -109,26 +109,33 @@
         IDWTELEM * const ref = b+w2 - 1;
         IDWTELEM b_0 = b[0];

         i = 0;
         asm volatile(
-            "psllw $2, %%xmm7 \n\t"
+            "psllw $13, %%xmm7 \n\t"
+            "pcmpeqw %%xmm6, %%xmm6 \n\t"
+            "psrlw $13, %%xmm6 \n\t"
+            "paddw %%xmm7, %%xmm6 \n\t"
         ::);
         for(; i<w_l-15; i+=16){
             asm volatile(
-                "movdqu (%1), %%xmm1 \n\t"
-                "movdqu 16(%1), %%xmm5 \n\t"
-                "movdqu 2(%1), %%xmm0 \n\t"
-                "movdqu 18(%1), %%xmm4 \n\t" //FIXME try aligned reads and shifts
-                "paddw %%xmm1, %%xmm0 \n\t"
-                "paddw %%xmm5, %%xmm4 \n\t"
-                "paddw %%xmm7, %%xmm0 \n\t"
-                "paddw %%xmm7, %%xmm4 \n\t"
+                "movdqu (%1), %%xmm0 \n\t"
+                "movdqu 16(%1), %%xmm4 \n\t"
+                "movdqu 2(%1), %%xmm1 \n\t"
+                "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+                "paddw %%xmm6, %%xmm0 \n\t"
+                "paddw %%xmm6, %%xmm4 \n\t"
+                "paddw %%xmm7, %%xmm1 \n\t"
+                "paddw %%xmm7, %%xmm5 \n\t"
+                "pavgw %%xmm1, %%xmm0 \n\t"
+                "pavgw %%xmm5, %%xmm4 \n\t"
+                "psubw %%xmm7, %%xmm0 \n\t"
+                "psubw %%xmm7, %%xmm4 \n\t"
+                "psraw $1, %%xmm0 \n\t"
+                "psraw $1, %%xmm4 \n\t"
                 "movdqa (%0), %%xmm1 \n\t"
                 "movdqa 16(%0), %%xmm5 \n\t"
-                "psraw $2, %%xmm0 \n\t"
-                "psraw $2, %%xmm4 \n\t"
                 "paddw %%xmm1, %%xmm0 \n\t"
                 "paddw %%xmm5, %%xmm4 \n\t"
                 "psraw $2, %%xmm0 \n\t"
                 "psraw $2, %%xmm4 \n\t"
                 "paddw %%xmm1, %%xmm0 \n\t"
@@ -286,22 +293,31 @@
         IDWTELEM * const ref = b+w2 - 1;

         i = 1;
         b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
         asm volatile(
-            "psllw $2, %%mm7 \n\t"
+            "psllw $13, %%mm7 \n\t"
+            "pcmpeqw %%mm6, %%mm6 \n\t"
+            "psrlw $13, %%mm6 \n\t"
+            "paddw %%mm7, %%mm6 \n\t"
         ::);
         for(; i<w_l-7; i+=8){
             asm volatile(
                 "movq (%1), %%mm0 \n\t"
                 "movq 8(%1), %%mm4 \n\t"
-                "paddw 2(%1), %%mm0 \n\t"
-                "paddw 10(%1), %%mm4 \n\t"
-                "paddw %%mm7, %%mm0 \n\t"
-                "paddw %%mm7, %%mm4 \n\t"
-                "psraw $2, %%mm0 \n\t"
-                "psraw $2, %%mm4 \n\t"
+                "movq 2(%1), %%mm1 \n\t"
+                "movq 10(%1), %%mm5 \n\t"
+                "paddw %%mm6, %%mm0 \n\t"
+                "paddw %%mm6, %%mm4 \n\t"
+                "paddw %%mm7, %%mm1 \n\t"
+                "paddw %%mm7, %%mm5 \n\t"
+                "pavgw %%mm1, %%mm0 \n\t"
+                "pavgw %%mm5, %%mm4 \n\t"
+                "psubw %%mm7, %%mm0 \n\t"
+                "psubw %%mm7, %%mm4 \n\t"
+                "psraw $1, %%mm0 \n\t"
+                "psraw $1, %%mm4 \n\t"
                 "movq (%0), %%mm1 \n\t"
                 "movq 8(%0), %%mm5 \n\t"
                 "paddw %%mm1, %%mm0 \n\t"
                 "paddw %%mm5, %%mm4 \n\t"
                 "psraw $2, %%mm0 \n\t"
@@ -465,20 +481,35 @@
         snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
         snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
         "mov %2, %%"REG_a" \n\t"
-        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
-        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7 \n\t"
+        "pcmpeqw %%xmm5, %%xmm5 \n\t"
+        "psllw $15, %%xmm7 \n\t"
+        "psrlw $13, %%xmm5 \n\t"
+        "paddw %%xmm7, %%xmm5 \n\t"
+        snow_vertical_compose_sse2_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq (%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm3 \n\t"
+        "pavgw %%xmm1, %%xmm0 \n\t"
+        "pavgw %%xmm3, %%xmm2 \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+        "paddw %%xmm7, %%xmm1 \n\t"
+        "paddw %%xmm7, %%xmm3 \n\t"
+        "pavgw %%xmm1, %%xmm4 \n\t"
+        "pavgw %%xmm3, %%xmm6 \n\t"
+        snow_vertical_compose_sse2_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")

-        "pcmpeqd %%xmm1, %%xmm1 \n\t"
-        "psllw $15, %%xmm1 \n\t"
-        "psrlw $14, %%xmm1 \n\t"
         "mov %1, %%"REG_S" \n\t"

-        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
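The vertical hunk makes the bookkeeping easy to check against the old code: the old sequence built the per-word constant 2 (pcmpeqd, or pcmpeqw in the MMX version, then psllw $15 and psrlw $14) and added it after the first shift, while the new sequence folds the equivalent rounding in before that shift, as 7 (psrlw $13) plus the implicit +1 of pavgw. Both placements give the same value because the offset is a multiple of 4:

    #include <assert.h>

    int main(void)
    {
        /* (x + 8) >> 2 == (x >> 2) + 2: adding the even rounding offset before
         * the shift (new code) or adding 2 after it (old code) is the same. */
        for (int x = -200000; x <= 200000; x++)
            assert(((x + 8) >> 2) == ((x >> 2) + 2));
        return 0;
    }

As in the horizontal case, the stored result is unchanged wherever the old code did not overflow; the 17-bit sum simply goes through pavgw instead of a wrapping paddw.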
@@ -567,20 +598,34 @@
         snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
         snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
         "mov %2, %%"REG_a" \n\t"
-        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
-        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7 \n\t"
+        "pcmpeqw %%mm5, %%mm5 \n\t"
+        "psllw $15, %%mm7 \n\t"
+        "psrlw $13, %%mm5 \n\t"
+        "paddw %%mm7, %%mm5 \n\t"
+        snow_vertical_compose_mmx_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq (%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+        "movq 8(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm3 \n\t"
+        "pavgw %%mm1, %%mm0 \n\t"
+        "pavgw %%mm3, %%mm2 \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+        "paddw %%mm7, %%mm1 \n\t"
+        "paddw %%mm7, %%mm3 \n\t"
+        "pavgw %%mm1, %%mm4 \n\t"
+        "pavgw %%mm3, %%mm6 \n\t"
+        snow_vertical_compose_sse2_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")

-        "pcmpeqw %%mm1, %%mm1 \n\t"
-        "psllw $15, %%mm1 \n\t"
-        "psrlw $14, %%mm1 \n\t"
         "mov %1, %%"REG_S" \n\t"

-        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
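The commit message asks for a plain-MMX follow-up; pavgw is the only MMX2 instruction the new code relies on. One conceivable building block for such a patch, shown here only as a hypothetical sketch and not part of this changeset, is the and/xor average: it also avoids the 17-bit sum and needs only pand, pxor, psraw and paddw, all available in plain MMX. Because the rounding offset (8) is even, it could be added after the halving rather than before it.

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical plain-MMX style building block: floor((a + b) / 2) without
     * ever forming a + b, using a + b == (a ^ b) + 2*(a & b).
     * Per 16-bit lane this maps to pand, pxor, psraw $1, paddw. */
    static int16_t avg_floor_mmx_style(int16_t a, int16_t b)
    {
        return (int16_t)((a & b) + ((a ^ b) >> 1));
    }

    int main(void)
    {
        for (int a = -32768; a <= 32767; a += 251)
            for (int b = -32768; b <= 32767; b += 241)
                assert(avg_floor_mmx_style((int16_t)a, (int16_t)b) == ((a + b) >> 1));
        return 0;
    }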