libavcodec.hg: comparison of i386/snowdsp_mmx.c @ 5594:384629ebcb93
avoid overflow in the 3rd lifting step; this now needs mmx2 at minimum
(patch for plain mmx support is welcome ...)
author    michael
date      Sun, 26 Aug 2007 01:11:02 +0000
parents   bd015f9ea964
children  9da9e00a04a5
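The change replaces plain paddw sums, which can overflow a signed 16-bit word, with pavgw, the mmx2/SSE packed instruction that computes the unsigned rounded average (a + b + 1) >> 1 with a full 17-bit intermediate. Biasing both operands into the unsigned range lets the same instruction average signed words. A minimal scalar sketch of that identity (my illustration, not code from the patch; the vertical hunks below materialize the 0x8000 bias with pcmpeqw/psllw $15):

    #include <stdint.h>

    /* Rounded average of two signed 16-bit words, the way the patched
     * asm does it: bias both operands by 0x8000 into the unsigned
     * range, average with pavgw semantics, then remove the bias.
     * Exact for every signed input pair; the 17-bit sum exists only
     * inside the (emulated) pavgw, never in a 16-bit lane. */
    static int16_t avg_signed(int16_t a, int16_t b)
    {
        uint16_t ua = (uint16_t)((uint16_t)a + 0x8000);         /* paddw bias */
        uint16_t ub = (uint16_t)((uint16_t)b + 0x8000);
        uint16_t av = (uint16_t)(((uint32_t)ua + ub + 1) >> 1); /* pavgw      */
        return (int16_t)(av - 0x8000);                          /* psubw bias */
    }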
--- i386/snowdsp_mmx.c  5593:bd015f9ea964
+++ i386/snowdsp_mmx.c  5594:384629ebcb93
@@ -109,26 +109,33 @@
     IDWTELEM * const ref = b+w2 - 1;
     IDWTELEM b_0 = b[0];
 
     i = 0;
     asm volatile(
-        "psllw $2, %%xmm7         \n\t"
+        "psllw $13, %%xmm7        \n\t"
+        "pcmpeqw %%xmm6, %%xmm6   \n\t"
+        "psrlw $13, %%xmm6        \n\t"
+        "paddw %%xmm7, %%xmm6     \n\t"
     ::);
     for(; i<w_l-15; i+=16){
         asm volatile(
-            "movdqu (%1), %%xmm1      \n\t"
-            "movdqu 16(%1), %%xmm5    \n\t"
-            "movdqu 2(%1), %%xmm0     \n\t"
-            "movdqu 18(%1), %%xmm4    \n\t" //FIXME try aligned reads and shifts
-            "paddw %%xmm1, %%xmm0     \n\t"
-            "paddw %%xmm5, %%xmm4     \n\t"
-            "paddw %%xmm7, %%xmm0     \n\t"
-            "paddw %%xmm7, %%xmm4     \n\t"
+            "movdqu (%1), %%xmm0      \n\t"
+            "movdqu 16(%1), %%xmm4    \n\t"
+            "movdqu 2(%1), %%xmm1     \n\t"
+            "movdqu 18(%1), %%xmm5    \n\t" //FIXME try aligned reads and shifts
+            "paddw %%xmm6, %%xmm0     \n\t"
+            "paddw %%xmm6, %%xmm4     \n\t"
+            "paddw %%xmm7, %%xmm1     \n\t"
+            "paddw %%xmm7, %%xmm5     \n\t"
+            "pavgw %%xmm1, %%xmm0     \n\t"
+            "pavgw %%xmm5, %%xmm4     \n\t"
+            "psubw %%xmm7, %%xmm0     \n\t"
+            "psubw %%xmm7, %%xmm4     \n\t"
+            "psraw $1, %%xmm0         \n\t"
+            "psraw $1, %%xmm4         \n\t"
            "movdqa (%0), %%xmm1      \n\t"
            "movdqa 16(%0), %%xmm5    \n\t"
-            "psraw $2, %%xmm0         \n\t"
-            "psraw $2, %%xmm4         \n\t"
            "paddw %%xmm1, %%xmm0     \n\t"
            "paddw %%xmm5, %%xmm4     \n\t"
            "psraw $2, %%xmm0         \n\t"
            "psraw $2, %%xmm4         \n\t"
            "paddw %%xmm1, %%xmm0     \n\t"
@@ -286,22 +293,31 @@
     IDWTELEM * const ref = b+w2 - 1;
 
     i = 1;
     b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
     asm volatile(
-        "psllw $2, %%mm7          \n\t"
+        "psllw $13, %%mm7         \n\t"
+        "pcmpeqw %%mm6, %%mm6     \n\t"
+        "psrlw $13, %%mm6         \n\t"
+        "paddw %%mm7, %%mm6       \n\t"
     ::);
     for(; i<w_l-7; i+=8){
         asm volatile(
            "movq (%1), %%mm0         \n\t"
            "movq 8(%1), %%mm4        \n\t"
-            "paddw 2(%1), %%mm0       \n\t"
-            "paddw 10(%1), %%mm4      \n\t"
-            "paddw %%mm7, %%mm0       \n\t"
-            "paddw %%mm7, %%mm4       \n\t"
-            "psraw $2, %%mm0          \n\t"
-            "psraw $2, %%mm4          \n\t"
+            "movq 2(%1), %%mm1        \n\t"
+            "movq 10(%1), %%mm5       \n\t"
+            "paddw %%mm6, %%mm0       \n\t"
+            "paddw %%mm6, %%mm4       \n\t"
+            "paddw %%mm7, %%mm1       \n\t"
+            "paddw %%mm7, %%mm5       \n\t"
+            "pavgw %%mm1, %%mm0       \n\t"
+            "pavgw %%mm5, %%mm4       \n\t"
+            "psubw %%mm7, %%mm0       \n\t"
+            "psubw %%mm7, %%mm4       \n\t"
+            "psraw $1, %%mm0          \n\t"
+            "psraw $1, %%mm4          \n\t"
            "movq (%0), %%mm1         \n\t"
            "movq 8(%0), %%mm5        \n\t"
            "paddw %%mm1, %%mm0       \n\t"
            "paddw %%mm5, %%mm4       \n\t"
            "psraw $2, %%mm0          \n\t"
@@ -465,20 +481,35 @@
         snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
         snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
         "mov %2, %%"REG_a"                           \n\t"
-        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
-        snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7                      \n\t"
+        "pcmpeqw %%xmm5, %%xmm5                      \n\t"
+        "psllw $15, %%xmm7                           \n\t"
+        "psrlw $13, %%xmm5                           \n\t"
+        "paddw %%xmm7, %%xmm5                        \n\t"
+        snow_vertical_compose_sse2_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq   (%%"REG_a",%%"REG_d",2), %%xmm1      \n\t"
+        "movq  8(%%"REG_a",%%"REG_d",2), %%xmm3      \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm0                        \n\t"
+        "pavgw %%xmm3, %%xmm2                        \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%xmm1      \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%xmm3      \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm4                        \n\t"
+        "pavgw %%xmm3, %%xmm6                        \n\t"
+        snow_vertical_compose_sse2_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
 
-        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
-        "psllw $15, %%xmm1                           \n\t"
-        "psrlw $14, %%xmm1                           \n\t"
         "mov %1, %%"REG_S"                           \n\t"
 
-        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6")
         snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
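In the vertical path the bias is the sign bit itself: %%xmm7 (resp. %%mm7 in the mmx hunk below) holds 0x8000 per word and %%xmm5 holds 0x8007, so the accumulators pick up the bias plus 7, the rows re-read for pavgw get the bias alone, and pavgw's implicit +1 completes a rounding offset of 8, i.e. the +2 the old code added before its second psraw $2, folded two shifts earlier. A small self-check of that equivalence (my sketch, not from the patch; exact while the biased accumulator does not wrap in 16 bits):

    #include <assert.h>
    #include <stdint.h>

    static uint16_t pavgw(uint16_t a, uint16_t b)   /* unsigned rounded average */
    {
        return (uint16_t)(((uint32_t)a + b + 1) >> 1);
    }

    /* New sequence: bias, average, unbias, arithmetic shift. */
    static int16_t lift_new(int16_t a, int16_t b)
    {
        uint16_t t0 = (uint16_t)((uint16_t)a + 0x8007); /* r2r_add of %%xmm5 */
        uint16_t t1 = (uint16_t)((uint16_t)b + 0x8000); /* paddw %%xmm7      */
        int16_t  av = (int16_t)(pavgw(t0, t1) - 0x8000);/* pavgw + psubw     */
        return (int16_t)(av >> 1);                      /* sra("1", ...)     */
    }

    int main(void)
    {
        /* The old code computed (a + b) >> 2 and added +2 before the
           next shift; folded into the first stage that is (a+b+8) >> 2.
           Relies on arithmetic >> for negatives, as psraw does. */
        for (int a = -32768; a <= 32760; a += 97)   /* a + 0x8007 must not wrap */
            for (int b = -32768; b <= 32767; b += 101)
                assert(lift_new((int16_t)a, (int16_t)b) == ((a + b + 8) >> 2));
        return 0;
    }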
@@ -567,20 +598,34 @@
         snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
         snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
         "mov %2, %%"REG_a"                           \n\t"
-        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
-        snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7                        \n\t"
+        "pcmpeqw %%mm5, %%mm5                        \n\t"
+        "psllw $15, %%mm7                            \n\t"
+        "psrlw $13, %%mm5                            \n\t"
+        "paddw %%mm7, %%mm5                          \n\t"
+        snow_vertical_compose_mmx_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq   (%%"REG_a",%%"REG_d",2), %%mm1       \n\t"
+        "movq  8(%%"REG_a",%%"REG_d",2), %%mm3       \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm0                          \n\t"
+        "pavgw %%mm3, %%mm2                          \n\t"
+        "movq 16(%%"REG_a",%%"REG_d",2), %%mm1       \n\t"
+        "movq 24(%%"REG_a",%%"REG_d",2), %%mm3       \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm4                          \n\t"
+        "pavgw %%mm3, %%mm6                          \n\t"
+        snow_vertical_compose_sse2_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
 
-        "pcmpeqw %%mm1, %%mm1                        \n\t"
-        "psllw $15, %%mm1                            \n\t"
-        "psrlw $14, %%mm1                            \n\t"
         "mov %1, %%"REG_S"                           \n\t"
 
-        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6")
         snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")