libavcodec.hg: alpha/dsputil_alpha

comparison alpha/dsputil_alpha_asm.S @ 509:cab79946302f libavcodec

Implement put_pixels_clamped and add_pixels_clamped in Assembler. This allows better scheduling of the memory accesses, and is portable among all compilers.

author	mellum
date	Mon, 01 Jul 2002 04:26:07 +0000
parents
children	ccd90a9cc09b

comparison

equal deleted inserted replaced

-:8f9fa4ec9cbb
+:cab79946302f
+/*
+* Alpha optimized DSP utils
+* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+/*
+* These functions are scheduled for pca56. They should work
+* reasonably on ev6, though.
+*/
+#include "regdef.h"
+/* Some nicer register names: ta..td name four further scratch
+ * registers on top of the t0-t9 names used directly in the code.
+ * td is AT, which is why ".set noat" appears below. Only ta and tb
+ * are referenced by the code in this file section.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value.
+ * te/tf/tg reuse the argument registers a5/a4/a3 and th reuses v0, so
+ * they may only be used where a3-a5 carry no arguments and v0 carries
+ * no result.  */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+.set noat               # AT is named explicitly (as td)
+.set noreorder          # keep the hand-scheduled instruction order
+.arch pca56             # target CPU; the MVI ops used in this file need it
+.text
+/************************************************************************
+* void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+*                                 int line_size)
+*
+* Clamps 64 signed 16-bit values read from block to the range 0..255 and
+* stores them as bytes: 8 rows of 8 pixels, consecutive rows line_size
+* bytes apart.
+*
+* In:       a0 = block, a1 = pixels, a2 = line_size
+* Out:      nothing
+* Clobbers: t0-t3, t8, t9, ta (t10), a0, a1
+*
+* Each loop iteration handles two rows (four quadwords of input), so the
+* row counter in t9 runs from 8 down to 0 in steps of 2.
+* NOTE(review): ldq and stl are aligned accesses, so block is presumably
+* 8-byte aligned and every pixel row 4-byte aligned; confirm with the
+* callers.
+*/
+.align 6
+.globl put_pixels_clamped_mvi_asm
+.ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+.frame sp, 0, ra
+.prologue 0
+lda     t8, -1          # all ones
+lda     t9, 8           # loop counter: rows left to process
+zap     t8, 0xaa, t8    # 00ff00ff00ff00ff (upper clamp, 255 per word)
+.align 4
+1:      ldq     t0,  0(a0)      # first row, values 0-3
+ldq     t1,  8(a0)      # first row, values 4-7
+ldq     t2, 16(a0)      # second row, values 0-3
+ldq     t3, 24(a0)      # second row, values 4-7
+maxsw4  t0, zero, t0    # clamp each word below at 0
+subq    t9, 2, t9       # two rows are consumed per iteration
+maxsw4  t1, zero, t1
+lda     a0, 32(a0)      # block += 16 values
+maxsw4  t2, zero, t2
+addq    a1, a2, ta      # ta = address of the second row
+maxsw4  t3, zero, t3
+minsw4  t0, t8, t0      # clamp each word above at 255
+minsw4  t1, t8, t1
+minsw4  t2, t8, t2
+minsw4  t3, t8, t3
+pkwb    t0, t0          # pack four words into four bytes
+pkwb    t1, t1
+pkwb    t2, t2
+pkwb    t3, t3
+stl     t0, 0(a1)       # first row, pixels 0-3
+stl     t1, 4(a1)       # first row, pixels 4-7
+addq    ta, a2, a1      # pixels = start of the next row pair
+stl     t2, 0(ta)       # second row, pixels 0-3
+stl     t3, 4(ta)       # second row, pixels 4-7
+bne     t9, 1b          # loop until all 8 rows are written
+ret
+.end put_pixels_clamped_mvi_asm
+/************************************************************************
+* void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+*                                 int line_size)
+*
+* Adds 64 signed 16-bit values read from block to the existing 8x8 pixel
+* bytes at pixels (rows line_size bytes apart), clamps each sum to 0..255
+* and writes it back in place.
+*
+* In:       a0 = block, a1 = pixels, a2 = line_size
+* Out:      nothing (v0 is used as the row counter and is 0 on return)
+* Clobbers: t0-t9, ta (t10), tb (t11), te (a5), tf (a4), tg (a3),
+*           th (v0), a0, a1
+*
+* Constants: tg = 0x8000800080008000 (sign bit of each word),
+*            tf = 0x00ff00ff00ff00ff (upper clamp, 255 per word).
+*
+* Each iteration handles two rows as four "quarters" of four pixels:
+*   quarter 0 = 0(a1), quarter 1 = 4(a1), quarter 2 = 0(te),
+*   quarter 3 = 4(te), where te = a1 + line_size.
+* The trailing "q s" comments give the quarter q and the step s:
+*   0 unpack pixel bytes to words    1 save the sign bits of the shorts
+*   2 clear those sign bits          3 add the pixels; with bit 15 clear
+*                                      and pixels <= 255 no carry can
+*                                      cross into the next word
+*   4 xor the saved sign bits back   5 clamp below at 0
+*   6 clamp above at 255             7 pack words to bytes
+*   8 store four pixels
+* Steps 1-4 together form a per-word 16-bit addition (modulo 2^16) using
+* ordinary 64-bit integer instructions.
+* NOTE(review): ldq, ldl and stl are aligned accesses, so block is
+* presumably 8-byte aligned and every pixel row 4-byte aligned; confirm
+* with the callers.
+*/
+.align 6
+.globl add_pixels_clamped_mvi_asm
+.ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+.frame sp, 0, ra
+.prologue 0
+lda     t1, -1          # all ones
+lda     th, 8           # loop counter: rows left to process
+zap     t1, 0x33, tg    # 0xffff0000ffff0000
+nop                     # NOTE(review): presumably scheduling padding
+srl     tg, 1, t0       # 0x7fff80007fff8000
+xor     tg, t0, tg      # 0x8000800080008000
+zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff
+.align 4
+1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
+ldl     t4, 4(a1)       # pix1
+addq    a1, a2, te      # pixels += line_size
+ldq     t0, 0(a0)       # shorts0
+ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
+ldl     ta, 4(te)       # pix3
+ldq     t3, 8(a0)       # shorts1
+ldq     t6, 16(a0)      # shorts2
+ldq     t9, 24(a0)      # shorts3
+unpkbw  t1, t1          # 0 0 (quarter/op no.)
+and     t0, tg, t2      # 0 1
+unpkbw  t4, t4          # 1 0
+bic     t0, tg, t0      # 0 2
+unpkbw  t7, t7          # 2 0
+and     t3, tg, t5      # 1 1
+addq    t0, t1, t0      # 0 3
+xor     t0, t2, t0      # 0 4
+unpkbw  ta, ta          # 3 0
+and     t6, tg, t8      # 2 1
+maxsw4  t0, zero, t0    # 0 5
+bic     t3, tg, t3      # 1 2
+bic     t6, tg, t6      # 2 2
+minsw4  t0, tf, t0      # 0 6
+addq    t3, t4, t3      # 1 3
+pkwb    t0, t0          # 0 7
+xor     t3, t5, t3      # 1 4
+maxsw4  t3, zero, t3    # 1 5
+addq    t6, t7, t6      # 2 3
+xor     t6, t8, t6      # 2 4
+and     t9, tg, tb      # 3 1
+minsw4  t3, tf, t3      # 1 6
+bic     t9, tg, t9      # 3 2
+maxsw4  t6, zero, t6    # 2 5
+addq    t9, ta, t9      # 3 3
+stl     t0, 0(a1)       # 0 8
+minsw4  t6, tf, t6      # 2 6
+xor     t9, tb, t9      # 3 4
+maxsw4  t9, zero, t9    # 3 5
+lda     a0, 32(a0)      # block += 16;
+pkwb    t3, t3          # 1 7
+minsw4  t9, tf, t9      # 3 6
+subq    th, 2, th       # two rows are consumed per iteration
+pkwb    t6, t6          # 2 7
+pkwb    t9, t9          # 3 7
+stl     t3, 4(a1)       # 1 8
+addq    te, a2, a1      # pixels += line_size
+stl     t6, 0(te)       # 2 8
+stl     t9, 4(te)       # 3 8
+bne     th, 1b          # loop until all 8 rows are updated
+ret
+.end add_pixels_clamped_mvi_asm

Mercurial > libavcodec.hg

comparison alpha/dsputil_alpha_asm.S @ 509:cab79946302f libavcodec