# HG changeset patch
# User mellum
# Date 1025567221 0
# Node ID fa4425cf6b31b59cde6bc038caf87317948b98df
# Parent ccd90a9cc09bff28eca2af525400531bec2d29af
Assembly version of put_pixels. This is currently the function
that takes the most time, and it allows for more efficient unaligned
access and better control over memory latencies.

diff -r ccd90a9cc09b -r fa4425cf6b31 alpha/dsputil_alpha.c
--- a/alpha/dsputil_alpha.c	Mon Jul 01 23:02:36 2002 +0000
+++ b/alpha/dsputil_alpha.c	Mon Jul 01 23:47:01 2002 +0000
@@ -22,6 +22,8 @@
 
 void simple_idct_axp(DCTELEM *block);
 
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        int line_size, int h);
 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);
 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
@@ -232,12 +234,12 @@
 
 void dsputil_init_alpha(void)
 {
-    put_pixels_tab[0] = put_pixels_axp;
+    put_pixels_tab[0] = put_pixels_axp_asm; /* assembly copy replaces the C version */
     put_pixels_tab[1] = put_pixels_x2_axp;
     put_pixels_tab[2] = put_pixels_y2_axp;
     put_pixels_tab[3] = put_pixels_xy2_axp;
 
-    put_no_rnd_pixels_tab[0] = put_pixels_axp;
+    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm; /* same routine as put_pixels_tab[0]: a straight copy involves no rounding */
     put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
diff -r ccd90a9cc09b -r fa4425cf6b31 alpha/dsputil_alpha_asm.S
--- a/alpha/dsputil_alpha_asm.S	Mon Jul 01 23:02:36 2002 +0000
+++ b/alpha/dsputil_alpha_asm.S	Mon Jul 01 23:47:01 2002 +0000
@@ -44,6 +44,123 @@
         .text
 
 /************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ * Copies h rows of 8 bytes from pixels to block; rows are line_size apart. */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra                /* leaf routine: frame size 0 */
+        .prologue 0                     /* a0 = block, a1 = pixels, a2 = line_size, a3 = h */
+
+#ifdef HAVE_GPROF
+        lda     AT, _mcount
+        jsr     AT, (AT), _mcount
+#endif
+
+        and     a1, 7, t0               /* t0 = low three bits of the source address */
+        beq     t0, $aligned            /* source is 8-byte aligned: use plain quadword loads */
+
+        .align 4
+$unaligned:                             /* misaligned source: 4 rows per iteration */
+        ldq_u   t0, 0(a1)               /* row 0: the two aligned quadwords that cover the 8 source bytes */
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1              /* step the source pointer to the next row */
+        nop                             /* NOTE(review): presumably padding to keep 4-instruction groups aligned -- confirm */
+
+        ldq_u   t2, 0(a1)               /* row 1 */
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)               /* row 2 */
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)               /* row 3 */
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0              /* byte shift comes from a1's low 3 bits; a1 has already advanced past row 0, so this assumes line_size is a multiple of 8 -- TODO confirm */
+        addq    a1, a2, a1              /* source pointer now at the first row of the next group */
+
+        extqh   t1, a1, t1              /* high part of row 0 */
+        addq    a0, a2, t8              /* t8 = destination of row 1 */
+        extql   t2, a1, t2
+        addq    t8, a2, t9              /* t9 = destination of row 2 */
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta              /* ta = destination of row 3 */
+        extql   t4, a1, t4
+        or      t0, t1, t0              /* merge low and high parts: t0 = the 8 bytes of row 0 */
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2              /* row 1 */
+        extql   t6, a1, t6
+        or      t4, t5, t4              /* row 2 */
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6              /* row 3 */
+        stq     t0, 0(a0)               /* quadword stores: assumes block and line_size keep the destination 8-byte aligned -- TODO confirm */
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3               /* four rows done */
+        stq     t6, 0(ta)
+        addq    ta, a2, a0              /* destination of the next group of rows */
+
+        bne     a3, $unaligned          /* loop ends only when the count hits exactly 0: h must be a positive multiple of 4 on this path */
+        ret
+
+        .align 4
+$aligned:                               /* aligned source: 8 rows per iteration */
+        ldq     t0, 0(a1)               /* rows 0..7 are loaded into t0..t7, one quadword each */
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t4, 0(a1)
+        addq    a1, a2, a1
+        ldq     t5, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t6, 0(a1)
+        addq    a1, a2, a1
+        ldq     t7, 0(a1)
+        addq    a1, a2, a1              /* source pointer now at the first row of the next group */
+
+        addq    a0, a2, t8              /* t8..te = destinations of rows 1..7 */
+        stq     t0, 0(a0)
+        addq    t8, a2, t9
+        stq     t1, 0(t8)
+
+        addq    t9, a2, ta
+        stq     t2, 0(t9)
+        addq    ta, a2, tb
+        stq     t3, 0(ta)
+
+        addq    tb, a2, tc
+        stq     t4, 0(tb)
+        addq    tc, a2, td
+        stq     t5, 0(tc)
+
+        addq    td, a2, te
+        stq     t6, 0(td)
+        addq    te, a2, a0              /* destination of the next group of rows */
+        stq     t7, 0(te)
+
+        subq    a3, 8, a3               /* eight rows done */
+        bne     a3, $aligned            /* NOTE(review): needs h to be a positive multiple of 8, stricter than the multiple of 4 accepted by $unaligned -- confirm no caller passes h = 4 */
+
+        ret
+        .end put_pixels_axp_asm
+
+/************************************************************************
  * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
  *                                 int line_size)
  */