comparison alpha/dsputil_alpha_asm.S @ 509:cab79946302f libavcodec

Implement put_pixels_clamped and add_pixels_clamped in Assembler. This allows better scheduling of the memory accesses, and is portable among all compilers.
author mellum
date Mon, 01 Jul 2002 04:26:07 +0000
parents
children ccd90a9cc09b
comparison
equal deleted inserted replaced
508:8f9fa4ec9cbb 509:cab79946302f
1 /*
2 * Alpha optimized DSP utils
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 /*
21 * These functions are scheduled for pca56. They should work
22 * reasonably on ev6, though.
23 */
24
25 #include "regdef.h"
26
27 /* Some nicer register names. */
/* ta..td are extra scratch names; of these only ta and tb are referenced
 * by the two routines visible in this file. */
28 #define ta t10
29 #define tb t11
30 #define tc t12
31 #define td AT
32 /* Danger: these overlap with the argument list and the return value */
/* te/tf/tg reuse argument registers a5/a4/a3 and th reuses v0.  The
 * routines in this file are declared with three arguments (a0-a2) and
 * return void, so nothing live is overwritten there.  Any new routine
 * with more arguments or a return value must not use these names blindly. */
33 #define te a5
34 #define tf a4
35 #define tg a3
36 #define th v0
37
/* AT is claimed above as td, so the assembler is told not to use it as
 * its own scratch register. */
38 .set noat
/* Keep instructions in exactly the hand-scheduled order written below. */
39 .set noreorder
/* Select the pca56 instruction set; the routines below rely on the
 * maxsw4, minsw4, pkwb and unpkbw instructions. */
40 .arch pca56
41 .text
42
43 /************************************************************************
44 * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
45 * int line_size)
 *
 * Clamps the 16-bit values read from block to 0..255 and stores them as
 * bytes at pixels: 8 rows of 8 bytes, consecutive rows line_size bytes
 * apart.  Each loop pass consumes 32 bytes of block and writes two rows.
 *
 * In:       a0 = block, a1 = pixels, a2 = line_size
 * Clobbers: t0-t3, t8, t9, ta (t10), a0, a1
 * Stack:    none (.frame size 0)
46 */
47 .align 6
48 .globl put_pixels_clamped_mvi_asm
49 .ent put_pixels_clamped_mvi_asm
50 put_pixels_clamped_mvi_asm:
51 .frame sp, 0, ra
52 .prologue 0
53
54 lda t8, -1 # t8 = all ones
55 lda t9, 8 # loop counter: rows remaining (two rows per pass)
56 zap t8, 0xaa, t8 # clear odd bytes: 00ff00ff00ff00ff (255 in every 16-bit lane)
57
58 .align 4 # align the loop head
59 1: ldq t0, 0(a0) # row 0, values 0-3
60 ldq t1, 8(a0) # row 0, values 4-7
61 ldq t2, 16(a0) # row 1, values 0-3
62 ldq t3, 24(a0) # row 1, values 4-7
63
64 maxsw4 t0, zero, t0 # per-lane signed max with 0: negatives become 0
65 subq t9, 2, t9 # two rows consumed this pass
66 maxsw4 t1, zero, t1
67 lda a0, 32(a0) # block += 16 values (32 bytes)
68
69 maxsw4 t2, zero, t2
70 addq a1, a2, ta # ta = address of row 1 (pixels + line_size)
71 maxsw4 t3, zero, t3
72 minsw4 t0, t8, t0 # per-lane signed min with 255
73
74 minsw4 t1, t8, t1
75 minsw4 t2, t8, t2
76 minsw4 t3, t8, t3
77 pkwb t0, t0 # pack the low byte of each lane into the low 32 bits
78
79 pkwb t1, t1
80 pkwb t2, t2
81 pkwb t3, t3
82 stl t0, 0(a1) # row 0, bytes 0-3
83
84 stl t1, 4(a1) # row 0, bytes 4-7 (a1 still points at row 0 here)
85 addq ta, a2, a1 # pixels += 2 * line_size, ready for the next pass
86 stl t2, 0(ta) # row 1, bytes 0-3
87 stl t3, 4(ta) # row 1, bytes 4-7
88
89 bne t9, 1b # loop until all 8 rows are written
90 ret
91 .end put_pixels_clamped_mvi_asm
92
93 /************************************************************************
94 * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
95 * int line_size)
 *
 * Adds the 16-bit values read from block to the bytes already stored at
 * pixels, clamps every sum to 0..255 and writes it back: 8 rows of 8
 * bytes, consecutive rows line_size bytes apart, two rows per loop pass.
 *
 * A pass is split into four quarters of four pixels each (0 and 1 are
 * the first row, 2 and 3 the second).  The trailing comments in the loop
 * read "quarter op", where op is:
 *   0 unpkbw  widen 4 pixel bytes to four 16-bit lanes
 *   1 and     save the sign bit of each lane of the block values
 *   2 bic     clear those sign bits so the add cannot carry across lanes
 *   3 addq    add the widened pixels (at most 0x7fff + 0xff per lane)
 *   4 xor     fold the saved sign bits back in
 *   5 maxsw4  clamp below at 0
 *   6 minsw4  clamp above at 255
 *   7 pkwb    narrow the four lanes back to 4 bytes
 *   8 stl     store the 4 pixels
 * The quarters are interleaved; do not reorder without rechecking the
 * register lifetimes.
 *
 * NOTE(review): the per-lane sum wraps modulo 2^16, so a block value
 * above 32767 - 255 could wrap negative and be clamped to 0 rather than
 * 255.  Presumably such values never reach this routine - confirm
 * against the callers.
 *
 * In:       a0 = block, a1 = pixels, a2 = line_size
 * Clobbers: t0-t9, ta (t10), tb (t11), te (a5), tf (a4), tg (a3),
 *           th (v0), a0, a1
 * Stack:    none (.frame size 0)
96 */
97 .align 6
98 .globl add_pixels_clamped_mvi_asm
99 .ent add_pixels_clamped_mvi_asm
100 add_pixels_clamped_mvi_asm:
101 .frame sp, 0, ra
102 .prologue 0
103
104 lda t1, -1 # t1 = all ones
105 lda th, 8 # loop counter: rows remaining (two rows per pass)
106 zap t1, 0x33, tg # clear bytes 0,1,4,5: 0xffff0000ffff0000
107 nop # no effect on results; presumably issue-slot padding
108
109 srl tg, 1, t0 # 0x7fff80007fff8000
110 xor tg, t0, tg # 0x8000800080008000 (sign bit of every 16-bit lane)
111 zap t1, 0xaa, tf # 0x00ff00ff00ff00ff (255 in every 16-bit lane)
112
113 .align 4 # align the loop head
114 1: ldl t1, 0(a1) # pix0 (try to hit cache line soon)
115 ldl t4, 4(a1) # pix1
116 addq a1, a2, te # te = second row (pixels + line_size)
117 ldq t0, 0(a0) # shorts0
118
119 ldl t7, 0(te) # pix2 (try to hit cache line soon)
120 ldl ta, 4(te) # pix3
121 ldq t3, 8(a0) # shorts1
122 ldq t6, 16(a0) # shorts2
123
124 ldq t9, 24(a0) # shorts3
125 unpkbw t1, t1 # 0 0 (quarter/op no.)
126 and t0, tg, t2 # 0 1
127 unpkbw t4, t4 # 1 0
128
129 bic t0, tg, t0 # 0 2
130 unpkbw t7, t7 # 2 0
131 and t3, tg, t5 # 1 1
132 addq t0, t1, t0 # 0 3
133
134 xor t0, t2, t0 # 0 4
135 unpkbw ta, ta # 3 0
136 and t6, tg, t8 # 2 1
137 maxsw4 t0, zero, t0 # 0 5
138
139 bic t3, tg, t3 # 1 2
140 bic t6, tg, t6 # 2 2
141 minsw4 t0, tf, t0 # 0 6
142 addq t3, t4, t3 # 1 3
143
144 pkwb t0, t0 # 0 7
145 xor t3, t5, t3 # 1 4
146 maxsw4 t3, zero, t3 # 1 5
147 addq t6, t7, t6 # 2 3
148
149 xor t6, t8, t6 # 2 4
150 and t9, tg, tb # 3 1
151 minsw4 t3, tf, t3 # 1 6
152 bic t9, tg, t9 # 3 2
153
154 maxsw4 t6, zero, t6 # 2 5
155 addq t9, ta, t9 # 3 3
156 stl t0, 0(a1) # 0 8
157 minsw4 t6, tf, t6 # 2 6
158
159 xor t9, tb, t9 # 3 4
160 maxsw4 t9, zero, t9 # 3 5
161 lda a0, 32(a0) # block += 16;
162 pkwb t3, t3 # 1 7
163
164 minsw4 t9, tf, t9 # 3 6
165 subq th, 2, th # two rows consumed this pass
166 pkwb t6, t6 # 2 7
167 pkwb t9, t9 # 3 7
168
169 stl t3, 4(a1) # 1 8
170 addq te, a2, a1 # pixels = te + line_size (first row of the next pass)
171 stl t6, 0(te) # 2 8
172 stl t9, 4(te) # 3 8
173
174 bne th, 1b # loop until all 8 rows are written
175 ret
176 .end add_pixels_clamped_mvi_asm
176 .end add_pixels_clamped_mvi_asm