Mercurial > libavcodec.hg
comparison bfin/pixels_bfin.S @ 4765:85298e8c55c4 libavcodec
bfin dsputils, basic pixel operations sads, diffs, motion compensation
and standard IEEE 8x8 block transforms
patch by Marc Hoffman, mmh pleasantst com
author | diego |
---|---|
date | Sun, 01 Apr 2007 22:28:45 +0000 |
parents | |
children | 75bf61c6c385 |
comparison
equal
deleted
inserted
replaced
4764:da0598df2e53 | 4765:85298e8c55c4 |
---|---|
1 /* | |
2 * Blackfin Pixel Operations | |
3 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 #include "config_bfin.h" | |
22 | |
// put_pixels_clamped: clamp an 8x8 block of coefficients to [0, 255] and
// store the packed bytes to dest.
//   block     (R0) -> I0, read as consecutive 32-bit words (two values each)
//   dest      (R1) -> I1
//   line_size (R2) -> M1 = line_size - 4, applied on the second store of a
//                     row so that I1 advances by line_size per row
// R4 = 0 and R5 = 0x00ff00ff are the bounds for the vector MAX/MIN pairs.
// The loop runs 8 times (P0 = 8), one row of eight pixels per pass. The
// first two words and the first MAX are issued before the loop, so the loop
// body always loads the words for the following row.
// NOTE(review): 2 + 8*4 = 34 words are read in total, i.e. two words past
// the 32 that are consumed -- presumably a harmless prefetch; confirm.
23 DEFUN(put_pixels_clamped,mL1, | |
24 (DCTELEM *block, uint8_t *dest, int line_size)): | |
25 [--SP] = (R7:4); | |
26 R4 = 0; | |
27 R5.l = 0x00ff; | |
28 R5.h = 0x00ff; | |
29 I0 = R0; // block | |
30 I1 = R1; // dest | |
31 R2 += -4; // line_size - 4 | |
32 M1 = R2; | |
33 P0 = 8; | |
34 R0 = [I0++]; | |
35 R1 = [I0++]; | |
36 R2 = MAX(R0, R4) (V); | |
37 LSETUP (ppc$0,ppc$1) LC0=P0; | |
// First half of the row: clamp, pack to bytes, store 4 pixels.
38 ppc$0: R2 = MIN(R2, R5) (V); | |
39 R3 = MAX(R1, R4) (V); | |
40 R3 = MIN(R3, R5) (V) || R0 = [I0++]; | |
41 R6 = BYTEPACK (R2,R3) || R1 = [I0++]; | |
42 R2 = MAX(R0, R4) (V) || [I1++] = R6; | |
// Second half of the row; the final store steps I1 to the next row.
43 R2 = MIN(R2, R5) (V); | |
44 R3 = MAX(R1, R4) (V); | |
45 R3 = MIN(R3, R5) (V) || R0 = [I0++]; | |
46 R6 = BYTEPACK (R2,R3) || R1 = [I0++]; | |
47 ppc$1: R2 = Max(R0, R4) (V) || [I1++M1] = R6; | |
48 | |
49 (R7:4) = [SP++]; | |
50 RTS; | |
51 | |
// add_pixels_clamped: combine an 8x8 block of coefficients with the pixels
// already in dest and write the result back to dest.
//   block     (R0) -> I3 (coefficient reads, word and half-word accesses)
//   dest      (R1) -> I1 (pixel reads) and I2 (result writes)
//   line_size (R2) -> M0 = line_size - 4, row step for both I1 and I2
// I0 is set to 0 before the BYTEOP3P sequence.
// NOTE(review): presumably I0 = 0 selects byte alignment 0 for BYTEOP3P and
// BYTEOP3P performs the clipped 16-bit + 8-bit add -- confirm against the
// Blackfin instruction reference.
// The loop runs 8 times on LC1 (P0 = 8). Each pass stores two words to dest:
// one with [I2++] and one with [I2++M0]. The LO half of the first result is
// computed before the loop and at the loop end for the next pass (software
// pipelining), and R6 + R7 merges the LO and HI results before each store.
52 DEFUN(add_pixels_clamped,mL1, | |
53 (DCTELEM *block, uint8_t *dest, int line_size)): | |
54 [-- SP] = (R7:4); | |
55 R4 = 0; | |
56 I0 = 0; | |
57 R2 += -4; // line_size - 4 | |
58 M0 = R2; | |
59 I1 = R1; // dest | |
60 I3 = R0; // block | |
61 I2 = R1; // dest | |
62 P0 = 8; | |
63 M3 = 2; | |
// Prime the pipeline: fetch the first coefficients and dest pixels.
64 R0 = [I3++] || R2 = [I1]; | |
65 R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++]; | |
66 R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4; | |
67 R6 = BYTEOP3P(R1:0, R3:2) (LO) || R1.H = W[I3++] || R2 = [I1]; | |
68 | |
69 LSETUP(apc$2,apc$3) LC1 = P0; | |
70 apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R3 = [I1++M0]; | |
71 R2 = R2 << 8 || R0.H = W[I3--]; | |
72 R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4; | |
73 R6 = R6 + R7 (S) || R1.H = W[I3]; | |
74 R6 = BYTEOP3P(R1:0, R3:2) (LO) || I3+=M3 || [I2++]=R6; | |
75 R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R2 = [I1]; | |
76 R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++]; | |
77 R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4; | |
78 R6 = R6 + R7 (S) || R1.H = W[I3++]; | |
79 apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO) || [I2++M0] = R6 || R2 = [I1]; | |
80 | |
81 (R7:4) = [SP++]; | |
82 RTS; | |
83 | |
84 | |
85 /* | |
86 motion compensation | |
87 primitives | |
88 | |
89 * Halfpel motion compensation with rounding (a+b+1)>>1. | |
90 * This is an array[4][4] of motion compensation functions for 4 | |
91 * horizontal blocksizes (8,16) and the 4 halfpel positions | |
92 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] | |
93 * @param block destination where the result is stored | |
94 * @param pixels source | |
95 * @param line_size number of bytes in a horizontal line of block | |
96 * @param h height | |
97 | |
98 */ | |
99 | |
// put_pixels8uc: combine two 8-pixel-wide sources with BYTEOP1P and store
// the result to block, for h rows.
//   block (R0) -> I3, s0 (R1) -> I0, s1 (R2) -> I1
//   dest_size, line_size and h are read from the stack at SP+12/16/20
//   before anything is pushed.
//   M3 = dest_size - 4: applied on the second store, so I3 advances by
//        dest_size per row.
//   M0 = line_size - 8: applied on the third load of a row, so I0/I1
//        advance by line_size per row.
// NOTE(review): BYTEOP1P without (T) is presumably the rounded byte average
// (a+b+1)>>1 described in the comment block above -- confirm.
// Three words per source are loaded for each row, with the third load
// issued alongside the first BYTEOP1P. The first word of the next row is
// loaded at the end of each pass, so the last pass reads one word beyond
// the final row.
100 DEFUN(put_pixels8uc,mL1, | |
101 (uint8_t *block, const uint8_t *s0, const uint8_t *s1, | |
102 int dest_size, int line_size, int h)): | |
103 i3=r0; // dest | |
104 i0=r1; // src0 | |
105 i1=r2; // src1 | |
106 r0=[sp+12]; // dest_size | |
107 r2=[sp+16]; // line_size | |
108 p0=[sp+20]; // h | |
109 [--sp] = (r7:6); | |
110 r0+=-4; | |
111 m3=r0; | |
112 r2+=-8; | |
113 m0=r2; | |
114 LSETUP(pp8$0,pp8$1) LC0=P0; | |
115 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
116 | |
117 pp8$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
118 R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0]|| R2 =[I1++M0]; | |
119 R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R6 ; | |
120 pp8$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7; | |
121 | |
122 (r7:6) = [sp++]; | |
123 RTS; | |
124 | |
// put_pixels16uc: 16-pixel-wide variant of put_pixels8uc. Combines s0 and
// s1 with BYTEOP1P and stores four words per row to block, for h rows.
//   block (R0) -> I3, s0 (R1) -> I0, s1 (R2) -> I1
//   dest_size, line_size and h are read from the frame at FP+20/24/28.
//   M3 = dest_size - 12: applied on the fourth store of a row.
//   M0 = line_size - 16: applied on the fifth load of a row.
// The first word of each source is loaded before the loop and the first
// word of the next row is loaded at the end of each pass.
125 DEFUN(put_pixels16uc,mL1, | |
126 (uint8_t *block, const uint8_t *s0, const uint8_t *s1, | |
127 int dest_size, int line_size, int h)): | |
128 link 0; | |
129 [--sp] = (r7:6); | |
130 i3=r0; // dest | |
131 i0=r1; // src0 | |
132 i1=r2; // src1 | |
133 r0=[fp+20]; // dest_size | |
134 r2=[fp+24]; // line_size | |
135 p0=[fp+28]; // h | |
136 | |
137 | |
138 r0+=-12; | |
139 m3=r0; // dest_size - 12 | |
140 r2+=-16; | |
141 m0=r2; | |
142 | |
143 LSETUP(pp16$0,pp16$1) LC0=P0; | |
144 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
145 | |
146 pp16$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
147 R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2 =[I1++]; | |
148 R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++] || R3 =[I1++]; | |
149 [I3++] = R6; | |
150 R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0] || R2 =[I1++M0]; | |
151 R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R7 ; | |
152 [I3++] = R6; | |
153 pp16$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7; | |
154 | |
155 (r7:6) = [sp++]; | |
156 unlink; | |
157 RTS; | |
158 | |
159 | |
160 | |
161 | |
162 | |
163 | |
// put_pixels8uc_nornd: same structure as put_pixels8uc but uses the (T)
// form of BYTEOP1P and a single stride for source and destination.
//   block (R0) -> I3, s0 (R1) -> I0, s1 (R2) -> I1
//   line_size and h are read from the stack at SP+12/16 before the push.
//   M3 = line_size - 4: destination row step, applied on the second store.
//   M0 = line_size - 8: source row step, applied on the third load.
// NOTE(review): (T) presumably selects truncation, i.e. (a+b)>>1 without
// the rounding term -- confirm against the instruction reference.
164 DEFUN(put_pixels8uc_nornd,mL1, | |
165 (uint8_t *block, const uint8_t *s0, const uint8_t *s1, | |
166 int line_size, int h)): | |
167 i3=r0; // dest | |
168 i0=r1; // src0 | |
169 i1=r2; // src1 | |
170 r2=[sp+12]; // line_size | |
171 p0=[sp+16]; // h | |
172 [--sp] = (r7:6); | |
173 r2+=-4; | |
174 m3=r2; | |
175 r2+=-4; | |
176 m0=r2; | |
177 LSETUP(pp8$2,pp8$3) LC0=P0; | |
178 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
179 | |
180 pp8$2: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
181 R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0]|| R2 =[I1++M0]; | |
182 R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R6 ; | |
183 pp8$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7; | |
184 | |
185 (r7:6) = [sp++]; | |
186 RTS; | |
187 | |
// put_pixels16uc_nornd: 16-pixel-wide variant of put_pixels8uc_nornd.
//   block (R0) -> I3, s0 (R1) -> I0, s1 (R2) -> I1
//   line_size and h are read from the stack at SP+12/16 before the push.
//   M3 = line_size - 12: destination row step, applied on the fourth store.
//   M0 = line_size - 16: source row step, applied on the fifth load.
// Each pass of the loop (h passes) produces four result words with the (T)
// form of BYTEOP1P.
188 DEFUN(put_pixels16uc_nornd,mL1, | |
189 (uint8_t *block, const uint8_t *s0, const uint8_t *s1, | |
190 int line_size, int h)): | |
191 i3=r0; // dest | |
192 i0=r1; // src0 | |
193 i1=r2; // src1 | |
194 r2=[sp+12]; // line_size | |
195 p0=[sp+16]; // h | |
196 | |
197 [--sp] = (r7:6); | |
198 r2+=-12; | |
199 m3=r2; // line_size - 12 | |
200 r2+=-4; | |
201 m0=r2; | |
202 | |
203 LSETUP(pp16$2,pp16$3) LC0=P0; | |
204 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
205 | |
206 pp16$2: | |
207 DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
208 R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++] || R2 =[I1++]; | |
209 R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++] || R3 =[I1++]; | |
210 [I3++] = R6; | |
211 | |
212 R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0] || R2 =[I1++M0]; | |
213 R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R7 ; | |
214 [I3++] = R6; | |
215 pp16$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7; | |
216 | |
217 (r7:6) = [sp++]; | |
218 | |
219 RTS; | |
220 | |
// z_put_pixels16_xy2: two-pass 16-pixel-wide combination of a source row
// and the row below it (I1 = I0 + line_size), h rows.
//   block (R0) -> I3, s0 (R1) -> I0 and I1, dest_size in R2,
//   line_size at FP+20, h at FP+24.
//   M2 = dest_size - 12: destination row step, applied on the fourth store.
//   M3 = line_size: used once to move I1 one row below I0.
//   M0 = line_size - 16: source row step, applied on the fifth load.
// Pass 1 (LS$16E..LE$16E) stores the BYTEOP2P (RNDL) results to block.
// The starting pointers saved in B0/B1/B3 are then restored, I0/I1 are
// advanced by one byte (M1 = 1), and pass 2 (LS$16O..LE$16O) computes the
// (RNDH) results and adds them with +|+ to the words already in block.
// NOTE(review): RNDL/RNDH presumably place the rounded 4-pixel averages in
// the low/high byte of each half-word, so the two passes fill the even and
// odd output pixels -- confirm against the instruction reference.
221 DEFUN(z_put_pixels16_xy2,mL1, | |
222 (uint8_t *block, const uint8_t *s0, | |
223 int dest_size, int line_size, int h)): | |
224 link 0; | |
225 [--sp] = (r7:4); | |
226 i3=r0; // dest | |
227 i0=r1; // src0--> pixels | |
228 i1=r1; // src1--> pixels + line_size | |
229 r2+=-12; | |
230 m2=r2; // m2=dest_size-12 | |
231 r2=[fp+20]; | |
232 m3=r2; // line_size | |
233 p0=[fp+24]; // h | |
234 r2+=-16; | |
235 i1+=m3; /* src1 + line_size */ | |
236 m0=r2; /* line_size - 16 */ | |
237 | |
238 B0 = I0; | |
239 B1 = I1; | |
240 B3 = I3; | |
241 | |
242 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
243 | |
244 LSETUP(LS$16E,LE$16E) LC0=P0; | |
245 LS$16E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
246 R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++] || R2 =[I1++]; | |
247 R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ; | |
248 DISALGNEXCPT || R3 = [I1++] || [I3++] = R5; | |
249 R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0]|| R2 = [I1++M0]; | |
250 R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ; | |
251 LE$16E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5; | |
252 | |
253 M1 = 1; | |
254 I3 = B3; | |
255 I1 = B1; | |
256 I0 = B0; | |
257 | |
258 I0 += M1; | |
259 I1 += M1; | |
260 | |
261 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
262 LSETUP(LS$16O,LE$16O) LC0=P0; | |
263 LS$16O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
264 R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++] || R2 =[I1++]; | |
265 R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6 =[I3++]; | |
266 R4 = R4 +|+ R6 || R7 = [I3--]; | |
267 R5 = R5 +|+ R7 || [I3++] = R4; | |
268 DISALGNEXCPT || R3 =[I1++] || [I3++] = R5; | |
269 R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0]|| R2 = [I1++M0]; | |
270 R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++]; | |
271 R4 = R4 +|+ R6 || R7 = [I3--]; | |
272 R5 = R5 +|+ R7 || [I3++] = R4; | |
273 LE$16O: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5; | |
274 | |
275 (r7:4) = [sp++]; | |
276 unlink; | |
277 rts; | |
278 | |
// put_pixels16_xy2_nornd: same two-pass structure as z_put_pixels16_xy2,
// using the (TL)/(TH) forms of BYTEOP2P and one stride for source and
// destination.
//   block (R0) -> I3, s0 (R1) -> I0 and I1, line_size in R2, h at FP+20.
//   M3 = line_size: used once to move I1 one row below I0.
//   M2 = line_size - 12: destination row step, applied on the fourth store.
//   M0 = line_size - 16: source row step, applied on the fifth load.
// Pass 1 stores the (TL) results; after restoring the pointers from
// B0/B1/B3 and stepping I0/I1 by one byte, pass 2 adds the (TH) results to
// the stored words with +|+.
// NOTE(review): TL/TH are presumably the truncating counterparts of
// RNDL/RNDH -- confirm against the instruction reference.
279 DEFUN(put_pixels16_xy2_nornd,mL1, | |
280 (uint8_t *block, const uint8_t *s0, | |
281 int line_size, int h)): | |
282 link 0; | |
283 [--sp] = (r7:4); | |
284 i3=r0; // dest | |
285 i0=r1; // src0--> pixels | |
286 i1=r1; // src1--> pixels + line_size | |
287 m3=r2; | |
288 r2+=-12; | |
289 m2=r2; | |
290 r2+=-4; | |
291 i1+=m3; /* src1 + line_size */ | |
292 m0=r2; /* line_size - 16 */ | |
293 p0=[fp+20]; // h | |
294 | |
295 B0=I0; | |
296 B1=I1; | |
297 B3=I3; | |
298 | |
299 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
300 | |
301 LSETUP(LS$16ET,LE$16ET) LC0=P0; | |
302 LS$16ET:DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
303 R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++] || R2 =[I1++]; | |
304 R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R1 = [I0++] || [I3++] = R4 ; | |
305 DISALGNEXCPT || R3 = [I1++] || [I3++] = R5; | |
306 R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++M0]|| R2 = [I1++M0]; | |
307 R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R0 = [I0++] || [I3++] = R4 ; | |
308 LE$16ET:DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5; | |
309 | |
310 M1 = 1; | |
311 I3=B3; | |
312 I1=B1; | |
313 I0=B0; | |
314 | |
315 I0 += M1; | |
316 I1 += M1; | |
317 | |
318 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
319 LSETUP(LS$16OT,LE$16OT) LC0=P0; | |
320 LS$16OT:DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
321 R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++] || R2 =[I1++]; | |
322 R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R1 = [I0++] || R6 =[I3++]; | |
323 R4 = R4 +|+ R6 || R7 = [I3--]; | |
324 R5 = R5 +|+ R7 || [I3++] = R4; | |
325 DISALGNEXCPT || R3 =[I1++] || [I3++] = R5; | |
326 R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++M0]|| R2 = [I1++M0]; | |
327 R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R0 = [I0++] || R6 = [I3++]; | |
328 R4 = R4 +|+ R6 || R7 = [I3--]; | |
329 R5 = R5 +|+ R7 || [I3++] = R4; | |
330 LE$16OT:DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5; | |
331 | |
332 (r7:4) = [sp++]; | |
333 unlink; | |
334 rts; | |
335 | |
// z_put_pixels8_xy2: 8-pixel-wide variant of z_put_pixels16_xy2.
//   block (R0) -> I3, s0 (R1) -> I0 and I1, dest_size in R2,
//   line_size at FP+20, h at FP+24.
//   M2 = dest_size - 4: destination row step, applied on the second store.
//   M3 = line_size: used once to move I1 one row below I0.
//   M0 = line_size - 8: source row step, applied on the third load.
// Pass 1 (LS$8E..LE$8E) stores the BYTEOP2P (RNDL) results; the pointers
// are restored from B0/B1/B3, I0/I1 are stepped by one byte (M1 = 1), and
// pass 2 (LS$8O..LE$8O) adds the (RNDH) results to the stored words.
336 DEFUN(z_put_pixels8_xy2,mL1, | |
337 (uint8_t *block, const uint8_t *s0, | |
338 int dest_size, int line_size, int h)): | |
339 link 0; | |
340 [--sp] = (r7:4); | |
341 i3=r0; // dest | |
342 i0=r1; // src0--> pixels | |
343 i1=r1; // src1--> pixels + line_size | |
344 r2+=-4; | |
345 m2=r2; // m2=dest_size-4 | |
346 r2=[fp+20]; | |
347 m3=r2; // line_size | |
348 p0=[fp+24]; // h | |
349 r2+=-8; | |
350 i1+=m3; /* src1 + line_size */ | |
351 m0=r2; /* line_size - 8 */ | |
352 | |
353 b0 = I0; | |
354 b1 = I1; | |
355 b3 = I3; | |
356 | |
357 LSETUP(LS$8E,LE$8E) LC0=P0; | |
358 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
359 LS$8E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
360 R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0] || R2 =[I1++M0]; | |
361 R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ; | |
362 LE$8E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5; | |
363 | |
364 M1 = 1; | |
365 I3 = b3; | |
366 I1 = b1; | |
367 I0 = b0; | |
368 | |
369 I0 += M1; | |
370 I1 += M1; | |
371 | |
372 LSETUP(LS$8O,LE$8O) LC0=P0; | |
373 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
374 LS$8O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++]; | |
375 R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0] || R2 =[I1++M0]; | |
376 R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 =[I3++]; | |
377 R4 = R4 +|+ R6 || R7 = [I3--]; | |
378 R5 = R5 +|+ R7 || [I3++] = R4; | |
379 LE$8O: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5; | |
380 | |
381 (r7:4) = [sp++]; | |
382 unlink; | |
383 rts; | |
384 | |
// put_pixels8_xy2_nornd: 8-pixel-wide variant of put_pixels16_xy2_nornd.
//   block (R0) -> I3, s0 (R1) -> I0 and I1, line_size in R2, h at FP+20.
//   M3 = line_size: used once to move I1 one row below I0.
//   M2 = line_size - 4: destination row step, applied on the second store.
//   M0 = line_size - 8: source row step, applied on the third load.
// Pass 1 (LS$8ET..LE$8ET) stores the BYTEOP2P (TL) results; the pointers
// are restored from B0/B1/B3, I0/I1 are stepped by one byte (M1 = 1), and
// pass 2 (LS$8OT..LE$8OT) adds the (TH) results to the stored words.
385 DEFUN(put_pixels8_xy2_nornd,mL1, | |
386 (uint8_t *block, const uint8_t *s0, int line_size, int h)): | |
387 link 0; | |
388 [--sp] = (r7:4); | |
389 i3=r0; // dest | |
390 i0=r1; // src0--> pixels | |
391 i1=r1; // src1--> pixels + line_size | |
392 m3=r2; | |
393 r2+=-4; | |
394 m2=r2; | |
395 r2+=-4; | |
396 i1+=m3; /* src1 + line_size */ | |
397 m0=r2; /* line_size - 8 */ | |
398 p0=[fp+20]; // h | |
399 | |
400 | |
401 b0 = I0; | |
402 b1 = I1; | |
403 b3 = I3; | |
404 | |
405 LSETUP(LS$8ET,LE$8ET) LC0=P0; | |
406 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
407 | |
408 LS$8ET: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
409 R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++M0] || R2 = [I1++M0]; | |
410 R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R0 = [I0++] || [I3++] = R4 ; | |
411 LE$8ET: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5; | |
412 | |
413 M1 = 1; | |
414 I3 = b3; | |
415 I1 = b1; | |
416 I0 = b0; | |
417 | |
418 I0 += M1; | |
419 I1 += M1; | |
420 | |
421 LSETUP(LS$8OT,LE$8OT) LC0=P0; | |
422 DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; | |
423 | |
424 LS$8OT: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
425 R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++M0] || R2 = [I1++M0]; | |
426 R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R0 = [I0++] || R6 = [I3++]; | |
427 R4 = R4 +|+ R6 || R7 = [I3--]; | |
428 R5 = R5 +|+ R7 || [I3++] = R4; | |
429 LE$8OT: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5; | |
430 | |
431 (r7:4) = [sp++]; | |
432 unlink; | |
433 rts; | |
434 | |
// diff_pixels: compute per-pixel values from s1 and s2 with BYTEOP16M for
// an 8x8 area and store them as four words per row to block.
//   block (R0) -> I3 (written contiguously, no row modifier)
//   s1 (R1) -> I0, s2 (R2) -> I1, stride read from FP+20
//   M0 = stride - 8: source row step, applied on the third load of a row.
// The loop runs 8 times (P0 = 8); each pass issues BYTEOP16M twice (plain
// and (R)) to cover eight pixels, storing R4, R5, R6, R7 in that order.
// NOTE(review): BYTEOP16M presumably yields the 16-bit differences
// s1[i] - s2[i] -- confirm against the instruction reference.
435 DEFUN(diff_pixels,mL1, | |
436 (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)): | |
437 link 0; | |
438 [--sp] = (r7:4); | |
439 p0=8; | |
440 i3=r0; // block | |
441 i0=r1; // s1 | |
442 i1=r2; // s2 | |
443 r2=[fp+20]; // stride | |
444 r2+=-8; | |
445 m0=r2; | |
446 | |
447 | |
448 LSETUP(.LS0,.LE0) LC0=P0; | |
449 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
450 | |
451 .LS0: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
452 (R5,R4) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0]; | |
453 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || [I3++] = R4; | |
454 DISALGNEXCPT || R2 = [I1++] || [I3++] = R5; | |
455 [i3++]=r6; | |
456 .LE0: [i3++]=r7; | |
457 | |
458 (r7:4) = [sp++]; | |
459 unlink; | |
460 rts; | |
461 | |
462 /* | |
463 for (i = 0; i < 16; i++) { | |
464 for (j = 0; j < 16; j++) { | |
465 sum += pix[j]; | |
466 } | |
467 pix += line_size; | |
468 } | |
469 */ | |
// pix_sum: accumulate the pixels of a 16-byte-wide area, two rows per loop
// pass, and return the total in R0.
//   p (R0)      -> I0 (even rows) and I1 = I0 + stride (odd rows)
//   stride (R1) -> M1 = stride, M0 = 2*stride - 16
// The loop runs 8 times (P0 = 8), so 16 rows are covered. Each BYTEOP16P
// combines four bytes from the I0 row with four bytes from the I1 row; the
// results are accumulated as packed 16-bit lanes in R6. After the loop the
// two lanes of R6 are added into R0.L and R0.H is cleared.
470 DEFUN(pix_sum,mL1, | |
471 (uint8_t *p, int stride)): | |
472 link 0; | |
473 [--sp] = (r7:4); | |
474 p0=8; | |
475 i0=r0; // s1 | |
476 i1=r0; | |
477 m1=r1; | |
478 r1=r1+r1; | |
479 r1+=-16; // 2*stride - 16 | |
480 m0=r1; | |
481 i1+=m1; | |
482 | |
483 r6=0; | |
484 | |
485 LSETUP(LS$PS,LE$PS) LC0=P0; | |
486 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
487 | |
488 LS$PS: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
489 (R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++] || R2 = [I1++]; | |
490 r6=r6+|+r5; | |
491 r6=r6+|+r4; | |
492 (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++] || R3 = [I1++]; | |
493 r6=r6+|+r5; | |
494 r6=r6+|+r4; | |
495 (R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++m0] || R2 = [I1++m0]; | |
496 r6=r6+|+r5; | |
497 r6=r6+|+r4; | |
498 (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++] || R2 = [I1++]; | |
499 r6=r6+|+r5; | |
500 LE$PS: r6=r6+|+r4; | |
501 r0.l=r6.l+r6.h; | |
502 r0.h=0; | |
503 | |
504 (r7:4) = [sp++]; | |
505 unlink; | |
506 rts; | |
507 | |
508 | |
// get_pixels: unpack an 8x8 area of bytes from pixels into 16-bit values
// and store them contiguously to block (four words per row).
//   block (R0)     -> I3 (no row modifier, written back to back)
//   pixels (R1)    -> I0
//   line_size (R2) -> M0 = line_size - 8, applied on the third load of a row
// The loop runs 8 times (P0 = 8). The first two words of a row are loaded
// ahead of the byteunpack pair that consumes them, so the last pass loads
// two words of the row after the final one without using them.
// The instruction at gp8$1 is terminated with ';' like every other
// instruction in this file.
509 DEFUN(get_pixels,mL1, | |
510 (DCTELEM *restrict block, const uint8_t *pixels, int line_size)): | |
511 [--sp] = (r7:4); | |
512 i3=r0; // dest | |
513 i0=r1; // src0 | |
514 p0=8; | |
515 r2+=-8; | |
516 m0=r2; | |
517 LSETUP(gp8$0,gp8$1) LC0=P0; | |
518 | |
519 DISALGNEXCPT || R0 = [I0++]; | |
520 DISALGNEXCPT || R1 = [I0++]; | |
521 | |
522 gp8$0: (R7,R6) = byteunpack R1:0 || R0 = [I0++M0]; | |
523 (R5,R4) = byteunpack R1:0 (R) || R0 = [I0++] || [I3++]=R6; | |
524 DISALGNEXCPT || R1 = [I0++] || [I3++]=R7; | |
525 [I3++]=R4; | |
526 gp8$1: [I3++]=R5; | |
527 | |
528 | |
529 (r7:4) = [sp++]; | |
530 RTS; | |
531 | |
532 | |
533 /* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */ | |
534 /* 91 cycles */ | |
// z_sad16x16: accumulate SAA results over a 16-byte-wide area of blk1 and
// blk2 for h rows and return the total in R0.
//   blk1 (R0) -> I0, row step M0 = dsz - 16 (dsz arrives in R2)
//   blk2 (R1) -> I1, row step M1 = line_size - 16 (line_size from SP+20)
//   h is read from SP+24 into P2 and used as the loop count.
// A0/A1 are cleared first; each pass issues four SAA operations (16 bytes).
// After the loop the low and high halves of A0 and A1 are summed into R0.
// NOTE(review): the stack arguments are read relative to SP after "link 0",
// while other routines in this file use FP after "link 0" -- presumably
// equivalent here; confirm.
535 DEFUN(z_sad16x16,mL1, | |
536 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)): | |
537 link 0; | |
538 I0 = R0; | |
539 I1 = R1; | |
540 | |
541 A1 = A0 = 0; | |
542 R0 = [sp+20]; // rwidth | |
543 P2 = [sp+24]; // height | |
544 R3 = 16; | |
545 R0 = R0 - R3; | |
546 R3 = R2 - R3; | |
547 M1 = R0; | |
548 M0 = R3; | |
549 | |
550 DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; | |
551 LSETUP (s$16, e$16) LC0=P2; | |
552 s$16: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
553 SAA (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++]; | |
554 SAA (R1:0,R3:2) (R) || R1 = [I0++] || R3 = [I1++]; | |
555 SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1]; | |
556 e$16: SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; | |
557 | |
558 R3=A1.L+A1.H, R2=A0.L+A0.H ; | |
559 R0 = R2 + R3 ; | |
560 unlink; | |
561 RTS; | |
562 | |
563 /* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */ | |
564 /* 36 cycles */ | |
// z_sad8x8: 8-byte-wide variant of z_sad16x16; returns the total in R0.
//   blk1 (R0) -> I0, row step M0 = dsz - 8 (dsz arrives in R2)
//   blk2 (R1) -> I1, row step M1 = line_size - 8 (line_size from SP+12)
//   h is read from SP+16 into P2 and used as the loop count.
// The first two words of each source are loaded before the loop; each pass
// issues two SAA operations (8 bytes) and loads the two words for the next
// row. After the loop the halves of A0 and A1 are summed into R0.
565 DEFUN(z_sad8x8,mL1, | |
566 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)): | |
567 I0 = R0; | |
568 I1 = R1; | |
569 | |
570 A1 = A0 = 0; | |
571 r0 = [sp+12]; // rwidth | |
572 P2 = [sp+16]; //height | |
573 R3 = 8; | |
574 R0 = R0 - R3; | |
575 R3 = R2 - R3; | |
576 M0 = R3; | |
577 M1 = R0; | |
578 | |
579 LSETUP (s$8, e$8) LC0=P2; | |
580 DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; | |
581 DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
582 s$8: SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1]; | |
583 SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; | |
584 e$8: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
585 | |
586 R3=A1.L+A1.H, R2=A0.L+A0.H ; | |
587 R0 = R2 + R3 ; | |
588 RTS; | |
589 | |
// pix_norm1: accumulate the squares of the unpacked pixel values over a
// 16-byte-wide area for 16 rows and return the total in R0.
//   pix (R0)       -> P1 -> I0
//   line_size (R1) -> P0; M0 = line_size - 16, applied on the fifth load of
//                     a row so I0 advances by line_size per row
// The loop runs 16 times on LC1 (P5 = 16). Each pass issues four byteunpack
// operations and multiply-accumulates every unpacked half-word with itself
// into A0/A1. The two accumulators are added with saturation at the end.
590 DEFUN(pix_norm1,mL1, | |
591 (uint8_t * pix, int line_size)): | |
592 [--SP]=(R7:4,P5:3); | |
593 | |
594 // Fetch the input arguments. | |
595 P1 = R0; // pix | |
596 P0 = R1; // line_size | |
597 P5 = 16; // loop ctr. | |
598 P0 -= P5; | |
599 M0 = P0; // M0 = line_size-16; | |
600 // Now for the real work. | |
601 A1 = A0 = 0; | |
602 lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5; | |
603 I0 = P1; | |
604 DISALGNEXCPT || r0 = [i0++]; | |
605 | |
606 _pix_norm1_blkfn_loopStart: | |
607 // each pass unpacks and squares the 16 bytes of one row | |
608 DISALGNEXCPT || r1 = [i0++]; | |
609 | |
610 (r5, r4) = byteunpack r1:0 || r0 = [i0++]; | |
611 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is); | |
612 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is); | |
613 (r5, r4) = byteunpack r1:0(r) || r1 = [i0++]; | |
614 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is); | |
615 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is); | |
616 (r5, r4) = byteunpack r1:0 || r0 = [i0++M0]; | |
617 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is); | |
618 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is); | |
619 (r5, r4) = byteunpack r1:0(r) || r0 = [i0++]; | |
620 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is); | |
621 _pix_norm1_blkfn_loopEnd: | |
622 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is); | |
623 | |
624 | |
625 // Clean up at the end: | |
626 R2 = A0, R3 = A1; | |
627 R0 = R2 + R3 (S); | |
628 | |
629 (R7:4,P5:3)=[SP++]; | |
630 | |
631 RTS; | |
632 | |
// sse4: sum of squared differences between pix1 and pix2 over a
// 4-pixel-wide area for h rows; the total is returned in R0.
//   v (R0) is unused.
//   pix1 (R1) -> I0, pix2 (R2) -> I1
//   line_size is read from FP+20, h from FP+24 (loop count).
//   M0 = line_size - 4: applied on the second load of a row.
// Per row, two words are loaded from each source: the first word ahead of
// time (before the loop for row 0, alongside BYTEOP16M for later rows) and
// the second at the top of the loop body. Together with the M0 step this
// advances I0/I1 by exactly line_size per row.
// Fix: the R0/R2 reload issued in parallel with BYTEOP16M was missing. The
// first word was therefore never refreshed after row 0, and the pointers
// advanced by only line_size - 4 per row. The reload mirrors what sse8 and
// sse16 do at the end of their loop bodies.
633 DEFUN(sse4,mL1, | |
634 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)): | |
635 link 0; | |
636 [--sp] = (r7:6); | |
637 p0=[fp+24]; // h | |
638 i0=r1; // pix1 | |
639 i1=r2; // pix2 | |
640 r2=[fp+20]; // line_size | |
641 r2+=-4; | |
642 m0=r2; | |
643 | |
644 a0=a1=0; | |
645 LSETUP(.S40,.E40) LC0=P0; | |
646 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
647 | |
648 .S40: DISALGNEXCPT || R1 = [I0++M0] || R3 = [I1++M0]; | |
649 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++]; | |
650 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
651 .E40: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
652 a0 += a1; | |
653 r0 = a0; | |
654 | |
655 (r7:6) = [sp++]; | |
656 unlink; | |
657 rts; | |
658 | |
// sse8: sum of squared differences between pix1 and pix2 over an
// 8-pixel-wide area for h rows; the total is returned in R0.
//   v (R0) is unused.
//   pix1 (R1) -> I0, pix2 (R2) -> I1
//   line_size is read from FP+20, h from FP+24 (loop count).
//   M0 = line_size - 8: applied on the third load of a row.
// Each pass issues BYTEOP16M twice (plain and (R)) and squares all four
// 16-bit results of each into A0/A1; A1 is folded into A0 after the loop.
659 DEFUN(sse8,mL1, | |
660 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)): | |
661 link 0; | |
662 [--sp] = (r7:6); | |
663 p0=[fp+24]; // h | |
664 i0=r1; // pix1 | |
665 i1=r2; // pix2 | |
666 r2=[fp+20]; // line_size | |
667 r2+=-8; | |
668 m0=r2; | |
669 | |
670 a0=a1=0; | |
671 LSETUP(.S80,.E80) LC0=P0; | |
672 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
673 | |
674 .S80: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
675 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0]; | |
676 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
677 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
678 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++]; | |
679 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
680 .E80: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
681 a0 += a1; | |
682 r0 = a0; | |
683 | |
684 (r7:6) = [sp++]; | |
685 unlink; | |
686 rts; | |
687 | |
// sse16: sum of squared differences between pix1 and pix2 over a
// 16-pixel-wide area for h rows; the total is returned in R0.
//   v (R0) is unused.
//   pix1 (R1) -> I0, pix2 (R2) -> I1
//   line_size is read from FP+20, h from FP+24 (loop count).
//   M0 = line_size - 16: applied on the fifth load of a row.
// Each pass issues BYTEOP16M four times and squares all four 16-bit results
// of each into A0/A1; A1 is folded into A0 after the loop.
688 DEFUN(sse16,mL1, | |
689 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)): | |
690 link 0; | |
691 [--sp] = (r7:6); | |
692 p0=[fp+24]; // h | |
693 i0=r1; // pix1 | |
694 i1=r2; // pix2 | |
695 r2=[fp+20]; // line_size | |
696 r2+=-16; | |
697 m0=r2; | |
698 | |
699 a0=a1=0; | |
700 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; | |
701 LSETUP(.S160,.E160) LC0=P0; | |
702 | |
703 .S160: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++]; | |
704 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++]; | |
705 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
706 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
707 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++] || R3 = [I1++]; | |
708 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
709 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
710 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0]; | |
711 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
712 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
713 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++]; | |
714 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is); | |
715 .E160: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is); | |
716 a0 += a1; | |
717 r0 = a0; | |
718 | |
719 (r7:6) = [sp++]; | |
720 unlink; | |
721 rts; | |
722 | |
723 |