comparison bfin/pixels_bfin.S @ 4765:85298e8c55c4 libavcodec

bfin dsputils, basic pixel operations sads, diffs, motion compensation and standard IEEE 8x8 block transforms patch by Marc Hoffman, mmh pleasantst com
author diego
date Sun, 01 Apr 2007 22:28:45 +0000
parents
children 75bf61c6c385
comparison
equal deleted inserted replaced
4764:da0598df2e53 4765:85298e8c55c4
1 /*
2 * Blackfin Pixel Operations
3 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21 #include "config_bfin.h"
22
/*
 * put_pixels_clamped: clamp the coefficients of an 8x8 block to 0..255 and
 * store them as bytes.
 *   R0 = block      source coefficients, handled as packed 16-bit pairs
 *   R1 = dest       destination pixels, 8 bytes written per row
 *   R2 = line_size  distance in bytes between destination rows
 * The loop is software pipelined: the loads for the next group overlap the
 * packing of the current one, so two words beyond the 64 coefficients are
 * read (and never stored) on the last iteration.
 */
23 DEFUN(put_pixels_clamped,mL1,
24 (DCTELEM *block, uint8_t *dest, int line_size)):
25 [--SP] = (R7:4); // save R7-R4, restored before RTS
26 R4 = 0; // lower clamp bound for both 16-bit halves
27 R5.l = 0x00ff; // upper clamp bound 255, low half
28 R5.h = 0x00ff; // upper clamp bound 255, high half
29 I0 = R0; // block
30 I1 = R1; // dest
31 R2 += -4; // line_size - 4: the first store of a row already advanced dest by 4
32 M1 = R2;
33 P0 = 8; // 8 rows
34 R0 = [I0++]; // preload coefficients 0,1
35 R1 = [I0++]; // preload coefficients 2,3
36 R2 = MAX(R0, R4) (V); // start clamping the first pair before entering the loop
37 LSETUP (ppc$0,ppc$1) LC0=P0;
38 ppc$0: R2 = MIN(R2, R5) (V);
39 R3 = MAX(R1, R4) (V);
40 R3 = MIN(R3, R5) (V) || R0 = [I0++];
41 R6 = BYTEPACK (R2,R3) || R1 = [I0++]; // pack the four clamped values into 4 bytes
42 R2 = MAX(R0, R4) (V) || [I1++] = R6; // store left half of the row
43 R2 = MIN(R2, R5) (V);
44 R3 = MAX(R1, R4) (V);
45 R3 = MIN(R3, R5) (V) || R0 = [I0++];
46 R6 = BYTEPACK (R2,R3) || R1 = [I0++];
47 ppc$1: R2 = Max(R0, R4) (V) || [I1++M1] = R6; // store right half, step to the next row
48
49 (R7:4) = [SP++];
50 RTS;
51
/*
 * add_pixels_clamped: add an 8x8 block of 16-bit coefficients to the 8x8
 * pixels already at dest (BYTEOP3P add/clip) and write the bytes back.
 *   R0 = block, R1 = dest, R2 = line_size (bytes between dest rows)
 * I1 reads dest, I2 writes dest, I3 walks the coefficients.
 * The halfword loads regroup each set of four coefficients so that R0 holds
 * coefficients 0 and 2 and R1 holds coefficients 1 and 3 before BYTEOP3P
 * combines them with the pixels held in R3:2.
 * NOTE(review): the << 8 / >> 8 shifts appear to move the dest bytes into
 * the lanes consumed by BYTEOP3P (LO) / (HI) - confirm against the ISA
 * reference.
 */
52 DEFUN(add_pixels_clamped,mL1,
53 (DCTELEM *block, uint8_t *dest, int line_size)):
54 [-- SP] = (R7:4); // save R7-R4, restored before RTS
55 R4 = 0;
56 I0 = 0; // NOTE(review): presumably clears the byte-alignment offset used by BYTEOP3P - confirm
57 R2 += -4; // line_size - 4: row advance after the first 4 bytes of a row
58 M0 = R2;
59 I1 = R1; // dest (read pointer)
60 I3 = R0; // block
61 I2 = R1; // dest (write pointer)
62 P0 = 8; // 8 rows
63 M3 = 2; // halfword step for I3
64 R0 = [I3++] || R2 = [I1]; // R0 = coefficients 0,1; R2 = first 4 dest pixels
65 R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++]; // R0.H = coefficient 2
66 R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4; // R1.L = coefficient 1
67 R6 = BYTEOP3P(R1:0, R3:2) (LO) || R1.H = W[I3++] || R2 = [I1]; // R1.H = coefficient 3
68
69 LSETUP(apc$2,apc$3) LC1 = P0;
70 apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R3 = [I1++M0];
71 R2 = R2 << 8 || R0.H = W[I3--];
72 R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
73 R6 = R6 + R7 (S) || R1.H = W[I3]; // merge the (LO) and (HI) results
74 R6 = BYTEOP3P(R1:0, R3:2) (LO) || I3+=M3 || [I2++]=R6; // store left half of the row
75 R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R2 = [I1];
76 R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++];
77 R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
78 R6 = R6 + R7 (S) || R1.H = W[I3++];
79 apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO) || [I2++M0] = R6 || R2 = [I1]; // store right half, next row
80
81 (R7:4) = [SP++];
82 RTS;
83
84
85 /*
86 motion compensation
87 primitives
88
89 * Halfpel motion compensation with rounding (a+b+1)>>1.
90 * This is an array[4][4] of motion compensation functions for 4
91 * horizontal blocksizes (8,16) and the 4 halfpel positions
92 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
93 * @param block destination where the result is stored
94 * @param pixels source
95 * @param line_size number of bytes in a horizontal line of block
96 * @param h height
97
98 */
99
/*
 * put_pixels8uc: dest = BYTEOP1P byte average of s0 and s1, 8 pixels per
 * row, h rows.
 *   R0 = block (dest), R1 = s0, R2 = s1,
 *   stack: dest_size (dest row pitch), line_size (source row pitch), h
 * The stack arguments are fetched before R7:6 are pushed, so the offsets
 * are relative to SP on entry.  The source loads run one word ahead of the
 * data being averaged (software pipelining).
 */
100 DEFUN(put_pixels8uc,mL1,
101 (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
102 int dest_size, int line_size, int h)):
103 i3=r0; // dest
104 i0=r1; // src0
105 i1=r2; // src1
106 r0=[sp+12]; // dest_size
107 r2=[sp+16]; // line_size
108 p0=[sp+20]; // h
109 [--sp] = (r7:6);
110 r0+=-4;
111 m3=r0; // dest_size - 4: dest advance after the second store of a row
112 r2+=-8;
113 m0=r2; // line_size - 8: source advance to the start of the next row
114 LSETUP(pp8$0,pp8$1) LC0=P0;
115 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of both sources
116
117 pp8$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
118 R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0]|| R2 =[I1++M0]; // pixels 0-3
119 R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R6 ; // pixels 4-7; R0 = next row
120 pp8$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
121
122 (r7:6) = [sp++];
123 RTS;
124
/*
 * put_pixels16uc: dest = BYTEOP1P byte average of s0 and s1, 16 pixels per
 * row, h rows.
 *   R0 = block (dest), R1 = s0, R2 = s1,
 *   [fp+20] = dest_size (dest row pitch), [fp+24] = line_size (source row
 *   pitch), [fp+28] = h
 * The source loads run one word ahead of the data being averaged.
 */
125 DEFUN(put_pixels16uc,mL1,
126 (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
127 int dest_size, int line_size, int h)):
128 link 0;
129 [--sp] = (r7:6);
130 i3=r0; // dest
131 i0=r1; // src0
132 i1=r2; // src1
133 r0=[fp+20]; // dest_size
134 r2=[fp+24]; // line_size
135 p0=[fp+28]; // h
136
137
138 r0+=-12;
139 m3=r0; // dest_size - 12: dest advance after the fourth store of a row
140 r2+=-16;
141 m0=r2; // line_size - 16: source advance to the start of the next row
142
143 LSETUP(pp16$0,pp16$1) LC0=P0;
144 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of both sources
145
146 pp16$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
147 R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2 =[I1++]; // pixels 0-3
148 R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++] || R3 =[I1++]; // pixels 4-7
149 [I3++] = R6;
150 R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0] || R2 =[I1++M0]; // pixels 8-11
151 R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R7 ; // pixels 12-15; R0 = next row
152 [I3++] = R6;
153 pp16$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
154
155 (r7:6) = [sp++];
156 unlink;
157 RTS;
158
159
160
161
162
163
/*
 * put_pixels8uc_nornd: same data flow as put_pixels8uc but using the (T)
 * option of BYTEOP1P, and with one line_size shared by dest and sources.
 *   R0 = block (dest), R1 = s0, R2 = s1, stack: line_size, h
 * The stack arguments are fetched before R7:6 are pushed, so the offsets
 * are relative to SP on entry.
 */
164 DEFUN(put_pixels8uc_nornd,mL1,
165 (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
166 int line_size, int h)):
167 i3=r0; // dest
168 i0=r1; // src0
169 i1=r2; // src1
170 r2=[sp+12]; // line_size
171 p0=[sp+16]; // h
172 [--sp] = (r7:6);
173 r2+=-4;
174 m3=r2; // line_size - 4: dest advance after the second store of a row
175 r2+=-4;
176 m0=r2; // line_size - 8: source advance to the start of the next row
177 LSETUP(pp8$2,pp8$3) LC0=P0;
178 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of both sources
179
180 pp8$2: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
181 R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0]|| R2 =[I1++M0]; // pixels 0-3
182 R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R6 ; // pixels 4-7; R0 = next row
183 pp8$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
184
185 (r7:6) = [sp++];
186 RTS;
187
/*
 * put_pixels16uc_nornd: same data flow as put_pixels16uc but using the (T)
 * option of BYTEOP1P, and with one line_size shared by dest and sources.
 *   R0 = block (dest), R1 = s0, R2 = s1, stack: line_size, h
 * The stack arguments are fetched before R7:6 are pushed, so the offsets
 * are relative to SP on entry.
 */
188 DEFUN(put_pixels16uc_nornd,mL1,
189 (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
190 int line_size, int h)):
191 i3=r0; // dest
192 i0=r1; // src0
193 i1=r2; // src1
194 r2=[sp+12]; // line_size
195 p0=[sp+16]; // h
196
197 [--sp] = (r7:6);
198 r2+=-12;
199 m3=r2; // line_size - 12: dest advance after the fourth store of a row
200 r2+=-4;
201 m0=r2; // line_size - 16: source advance to the start of the next row
202
203 LSETUP(pp16$2,pp16$3) LC0=P0;
204 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of both sources
205
206 pp16$2:
207 DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
208 R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++] || R2 =[I1++]; // pixels 0-3
209 R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++] || R3 =[I1++]; // pixels 4-7
210 [I3++] = R6;
211
212 R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0] || R2 =[I1++M0]; // pixels 8-11
213 R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R7 ; // pixels 12-15; R0 = next row
214 [I3++] = R6;
215 pp16$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
216
217 (r7:6) = [sp++];
218
219 RTS;
220
/*
 * z_put_pixels16_xy2: 16 pixels per row, h rows; every result is produced
 * by BYTEOP2P from the source row (I0) and the row line_size below it (I1).
 *   R0 = block (dest), R1 = s0 (source), R2 = dest_size (dest row pitch),
 *   [fp+20] = line_size (source row pitch), [fp+24] = h
 * Two passes over the block, with the start pointers kept in B0/B1/B3:
 *   pass 1 (LS$16E..LE$16E) stores the (RNDL) results to dest;
 *   pass 2 (LS$16O..LE$16O) restarts one byte further into the sources,
 *   computes the (RNDH) results and adds them (+|+) to the words written
 *   by pass 1.
 * NOTE(review): presumably the L and H passes fill the even and odd output
 * bytes respectively (labels E/O) - confirm against the BYTEOP2P
 * description.
 */
221 DEFUN(z_put_pixels16_xy2,mL1,
222 (uint8_t *block, const uint8_t *s0,
223 int dest_size, int line_size, int h)):
224 link 0;
225 [--sp] = (r7:4);
226 i3=r0; // dest
227 i0=r1; // src0--> pixels
228 i1=r1; // src1--> pixels + line_size
229 r2+=-12;
230 m2=r2; // m2 = dest_size - 12: dest advance after the fourth store of a row
231 r2=[fp+20];
232 m3=r2; // line_size
233 p0=[fp+24]; // h
234 r2+=-16;
235 i1+=m3; /* src1 + line_size */
236 m0=r2; /* line_size - 16: source advance to the start of the next row */
237
238 B0 = I0; // remember the start pointers for the second pass
239 B1 = I1;
240 B3 = I3;
241
242 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
243
244 LSETUP(LS$16E,LE$16E) LC0=P0;
245 LS$16E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
246 R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++] || R2 =[I1++];
247 R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
248 DISALGNEXCPT || R3 = [I1++] || [I3++] = R5;
249 R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0]|| R2 = [I1++M0];
250 R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
251 LE$16E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
252
253 M1 = 1;
254 I3 = B3; // rewind dest and both source pointers
255 I1 = B1;
256 I0 = B0;
257
258 I0 += M1; // second pass starts one byte to the right
259 I1 += M1;
260
261 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
262 LSETUP(LS$16O,LE$16O) LC0=P0;
263 LS$16O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
264 R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++] || R2 =[I1++];
265 R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6 =[I3++]; // R6 = first word from pass 1
266 R4 = R4 +|+ R6 || R7 = [I3--]; // R7 = second word from pass 1; I3 back on the first word
267 R5 = R5 +|+ R7 || [I3++] = R4;
268 DISALGNEXCPT || R3 =[I1++] || [I3++] = R5;
269 R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0]|| R2 = [I1++M0];
270 R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
271 R4 = R4 +|+ R6 || R7 = [I3--];
272 R5 = R5 +|+ R7 || [I3++] = R4;
273 LE$16O: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
274
275 (r7:4) = [sp++];
276 unlink;
277 rts;
278
/*
 * put_pixels16_xy2_nornd: same two-pass scheme as z_put_pixels16_xy2 but
 * using the (TL)/(TH) options of BYTEOP2P, and with one line_size shared by
 * dest and source.
 *   R0 = block (dest), R1 = s0 (source), R2 = line_size, [fp+20] = h
 *   pass 1 (LS$16ET..LE$16ET) stores the (TL) results to dest;
 *   pass 2 (LS$16OT..LE$16OT) restarts one byte further into the sources and
 *   adds the (TH) results (+|+) to the words written by pass 1.
 */
279 DEFUN(put_pixels16_xy2_nornd,mL1,
280 (uint8_t *block, const uint8_t *s0,
281 int line_size, int h)):
282 link 0;
283 [--sp] = (r7:4);
284 i3=r0; // dest
285 i0=r1; // src0--> pixels
286 i1=r1; // src1--> pixels + line_size
287 m3=r2; // line_size
288 r2+=-12;
289 m2=r2; // line_size - 12: dest advance after the fourth store of a row
290 r2+=-4;
291 i1+=m3; /* src1 + line_size */
292 m0=r2; /* line_size - 16: source advance to the start of the next row */
293 p0=[fp+20]; // h
294
295 B0=I0; // remember the start pointers for the second pass
296 B1=I1;
297 B3=I3;
298
299 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
300
301 LSETUP(LS$16ET,LE$16ET) LC0=P0;
302 LS$16ET:DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
303 R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++] || R2 =[I1++];
304 R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R1 = [I0++] || [I3++] = R4 ;
305 DISALGNEXCPT || R3 = [I1++] || [I3++] = R5;
306 R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++M0]|| R2 = [I1++M0];
307 R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R0 = [I0++] || [I3++] = R4 ;
308 LE$16ET:DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
309
310 M1 = 1;
311 I3=B3; // rewind dest and both source pointers
312 I1=B1;
313 I0=B0;
314
315 I0 += M1; // second pass starts one byte to the right
316 I1 += M1;
317
318 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
319 LSETUP(LS$16OT,LE$16OT) LC0=P0;
320 LS$16OT:DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
321 R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++] || R2 =[I1++];
322 R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R1 = [I0++] || R6 =[I3++]; // R6 = first word from pass 1
323 R4 = R4 +|+ R6 || R7 = [I3--]; // R7 = second word from pass 1; I3 back on the first word
324 R5 = R5 +|+ R7 || [I3++] = R4;
325 DISALGNEXCPT || R3 =[I1++] || [I3++] = R5;
326 R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++M0]|| R2 = [I1++M0];
327 R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R0 = [I0++] || R6 = [I3++];
328 R4 = R4 +|+ R6 || R7 = [I3--];
329 R5 = R5 +|+ R7 || [I3++] = R4;
330 LE$16OT:DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
331
332 (r7:4) = [sp++];
333 unlink;
334 rts;
335
/*
 * z_put_pixels8_xy2: 8 pixels per row, h rows; every result is produced by
 * BYTEOP2P from the source row (I0) and the row line_size below it (I1).
 *   R0 = block (dest), R1 = s0 (source), R2 = dest_size (dest row pitch),
 *   [fp+20] = line_size (source row pitch), [fp+24] = h
 * Two passes over the block, with the start pointers kept in B0/B1/B3:
 *   pass 1 (LS$8E..LE$8E) stores the (RNDL) results to dest;
 *   pass 2 (LS$8O..LE$8O) restarts one byte further into the sources and
 *   adds the (RNDH) results (+|+) to the words written by pass 1.
 */
336 DEFUN(z_put_pixels8_xy2,mL1,
337 (uint8_t *block, const uint8_t *s0,
338 int dest_size, int line_size, int h)):
339 link 0;
340 [--sp] = (r7:4);
341 i3=r0; // dest
342 i0=r1; // src0--> pixels
343 i1=r1; // src1--> pixels + line_size
344 r2+=-4;
345 m2=r2; // m2 = dest_size - 4: dest advance after the second store of a row
346 r2=[fp+20];
347 m3=r2; // line_size
348 p0=[fp+24]; // h
349 r2+=-8;
350 i1+=m3; /* src1 + line_size */
351 m0=r2; /* line_size - 8: source advance to the start of the next row */
352
353 b0 = I0; // remember the start pointers for the second pass
354 b1 = I1;
355 b3 = I3;
356
357 LSETUP(LS$8E,LE$8E) LC0=P0;
358 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
359 LS$8E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
360 R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0] || R2 =[I1++M0];
361 R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
362 LE$8E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
363
364 M1 = 1;
365 I3 = b3; // rewind dest and both source pointers
366 I1 = b1;
367 I0 = b0;
368
369 I0 += M1; // second pass starts one byte to the right
370 I1 += M1;
371
372 LSETUP(LS$8O,LE$8O) LC0=P0;
373 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
374 LS$8O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
375 R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0] || R2 =[I1++M0];
376 R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 =[I3++]; // R6 = first word from pass 1
377 R4 = R4 +|+ R6 || R7 = [I3--]; // R7 = second word from pass 1; I3 back on the first word
378 R5 = R5 +|+ R7 || [I3++] = R4;
379 LE$8O: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5;
380
381 (r7:4) = [sp++];
382 unlink;
383 rts;
384
/*
 * put_pixels8_xy2_nornd: same two-pass scheme as z_put_pixels8_xy2 but
 * using the (TL)/(TH) options of BYTEOP2P, and with one line_size shared by
 * dest and source.
 *   R0 = block (dest), R1 = s0 (source), R2 = line_size, [fp+20] = h
 *   pass 1 (LS$8ET..LE$8ET) stores the (TL) results to dest;
 *   pass 2 (LS$8OT..LE$8OT) restarts one byte further into the sources and
 *   adds the (TH) results (+|+) to the words written by pass 1.
 */
385 DEFUN(put_pixels8_xy2_nornd,mL1,
386 (uint8_t *block, const uint8_t *s0, int line_size, int h)):
387 link 0;
388 [--sp] = (r7:4);
389 i3=r0; // dest
390 i0=r1; // src0--> pixels
391 i1=r1; // src1--> pixels + line_size
392 m3=r2; // line_size
393 r2+=-4;
394 m2=r2; // line_size - 4: dest advance after the second store of a row
395 r2+=-4;
396 i1+=m3; /* src1 + line_size */
397 m0=r2; /* line_size - 8: source advance to the start of the next row */
398 p0=[fp+20]; // h
399
400
401 b0 = I0; // remember the start pointers for the second pass
402 b1 = I1;
403 b3 = I3;
404
405 LSETUP(LS$8ET,LE$8ET) LC0=P0;
406 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
407
408 LS$8ET: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
409 R4 = BYTEOP2P (R3:2,R1:0) (TL) || R0 = [I0++M0] || R2 = [I1++M0];
410 R5 = BYTEOP2P (R3:2,R1:0) (TL,R) || R0 = [I0++] || [I3++] = R4 ;
411 LE$8ET: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
412
413 M1 = 1;
414 I3 = b3; // rewind dest and both source pointers
415 I1 = b1;
416 I0 = b0;
417
418 I0 += M1; // second pass starts one byte to the right
419 I1 += M1;
420
421 LSETUP(LS$8OT,LE$8OT) LC0=P0;
422 DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
423
424 LS$8OT: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
425 R4 = BYTEOP2P (R3:2,R1:0) (TH) || R0 = [I0++M0] || R2 = [I1++M0];
426 R5 = BYTEOP2P (R3:2,R1:0) (TH,R) || R0 = [I0++] || R6 = [I3++]; // R6 = first word from pass 1
427 R4 = R4 +|+ R6 || R7 = [I3--]; // R7 = second word from pass 1; I3 back on the first word
428 R5 = R5 +|+ R7 || [I3++] = R4;
429 LE$8OT: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5;
430
431 (r7:4) = [sp++];
432 unlink;
433 rts;
434
/*
 * diff_pixels: write the byte differences between an 8x8 area of s1 and of
 * s2 (BYTEOP16M) to block as 64 consecutive 16-bit values.
 *   R0 = block, R1 = s1, R2 = s2, [fp+20] = stride (row pitch of s1 and s2)
 * Each iteration handles one row of 8 pixels and stores four words.
 */
435 DEFUN(diff_pixels,mL1,
436 (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):
437 link 0;
438 [--sp] = (r7:4);
439 p0=8; // 8 rows
440 i3=r0; // block
441 i0=r1; // s1
442 i1=r2; // s2
443 r2=[fp+20]; // stride
444 r2+=-8;
445 m0=r2; // stride - 8: source advance to the start of the next row
446
447
448 LSETUP(.LS0,.LE0) LC0=P0;
449 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of both sources
450
451 .LS0: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
452 (R5,R4) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0]; // pixels 0-3
453 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || [I3++] = R4; // pixels 4-7; R0 = next row
454 DISALGNEXCPT || R2 = [I1++] || [I3++] = R5;
455 [i3++]=r6;
456 .LE0: [i3++]=r7;
457
458 (r7:4) = [sp++];
459 unlink;
460 rts;
461
462 /*
463 for (i = 0; i < 16; i++) {
464 for (j = 0; j < 16; j++) {
465 sum += pix[j];
466 }
467 pix += line_size;
468 }
469 */
/*
 * pix_sum: return in R0 the sum of the pixels of a 16x16 area.
 *   R0 = p, R1 = stride
 * Two rows are handled per iteration: I0 walks the even rows, I1 the odd
 * rows, and BYTEOP16P adds the bytes of both rows into 16-bit lanes that are
 * accumulated in the two halves of R6.
 */
470 DEFUN(pix_sum,mL1,
471 (uint8_t *p, int stride)):
472 link 0;
473 [--sp] = (r7:4);
474 p0=8; // 8 iterations of 2 rows
475 i0=r0; // s1
476 i1=r0;
477 m1=r1; // stride
478 r1=r1+r1;
479 r1+=-16; // 2*stride - 16: advance to the row two lines further down
480 m0=r1;
481 i1+=m1; // I1 = p + stride (second row of each pair)
482
483 r6=0; // packed 16-bit accumulators
484
485 LSETUP(LS$PS,LE$PS) LC0=P0;
486 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of both rows
487
488 LS$PS: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
489 (R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++] || R2 = [I1++]; // pixels 0-3
490 r6=r6+|+r5;
491 r6=r6+|+r4;
492 (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++] || R3 = [I1++]; // pixels 4-7
493 r6=r6+|+r5;
494 r6=r6+|+r4;
495 (R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++m0] || R2 = [I1++m0]; // pixels 8-11
496 r6=r6+|+r5;
497 r6=r6+|+r4;
498 (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++] || R2 = [I1++]; // pixels 12-15; preload next pair
499 r6=r6+|+r5;
500 LE$PS: r6=r6+|+r4;
501 r0.l=r6.l+r6.h; // fold the two 16-bit accumulators
502 r0.h=0; // result is returned as a 16-bit value in the low half of R0
503
504 (r7:4) = [sp++];
505 unlink;
506 rts;
507
508
/*
 * get_pixels: expand an 8x8 area of bytes into 64 consecutive 16-bit values
 * (BYTEUNPACK).
 *   R0 = block (dest), R1 = pixels (source), R2 = line_size (source pitch)
 * Each iteration handles one row of 8 pixels and stores four words; the
 * loads for the next row are issued while the current one is stored.
 */
509 DEFUN(get_pixels,mL1,
510 (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):
511 [--sp] = (r7:4);
512 i3=r0; // dest
513 i0=r1; // src0
514 p0=8; // 8 rows
515 r2+=-8;
516 m0=r2; // line_size - 8: source advance to the start of the next row
517 LSETUP(gp8$0,gp8$1) LC0=P0;
518
519 DISALGNEXCPT || R0 = [I0++]; // preload the first two words of row 0
520 DISALGNEXCPT || R1 = [I0++];
521
522 gp8$0: (R7,R6) = byteunpack R1:0 || R0 = [I0++M0]; // pixels 0-3
523 (R5,R4) = byteunpack R1:0 (R) || R0 = [I0++] || [I3++]=R6; // pixels 4-7; R0 = next row
524 DISALGNEXCPT || R1 = [I0++] || [I3++]=R7;
525 [I3++]=R4;
526 gp8$1: [I3++]=R5; // terminating semicolon added, matching every other instruction
527
528
529 (r7:4) = [sp++];
530 RTS;
531
532
533 /* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
534 /* 91 cycles */
/*
 * z_sad16x16: sum of absolute differences between blk1 and blk2, 16 pixels
 * per row, h rows, accumulated with SAA in A0/A1 and returned in R0.
 *   R0 = blk1, R1 = blk2, R2 = dsz (row pitch of blk1),
 *   [sp+20] = line_size (row pitch of blk2), [sp+24] = h
 * Nothing is pushed after LINK 0, so SP equals FP when the stack arguments
 * are read.
 */
535 DEFUN(z_sad16x16,mL1,
536 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
537 link 0;
538 I0 = R0; // blk1
539 I1 = R1; // blk2
540
541 A1 = A0 = 0;
542 R0 = [sp+20]; // rwidth
543 P2 = [sp+24]; // height
544 R3 = 16;
545 R0 = R0 - R3; // line_size - 16
546 R3 = R2 - R3; // dsz - 16
547 M1 = R0; // blk2 advance to the start of the next row
548 M0 = R3; // blk1 advance to the start of the next row
549
550 DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; // preload first word of both blocks
551 LSETUP (s$16, e$16) LC0=P2;
552 s$16: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
553 SAA (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++]; // pixels 0-3
554 SAA (R1:0,R3:2) (R) || R1 = [I0++] || R3 = [I1++]; // pixels 4-7
555 SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1]; // pixels 8-11
556 e$16: SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; // pixels 12-15; preload next row
557
558 R3=A1.L+A1.H, R2=A0.L+A0.H ; // fold the four 16-bit partial sums
559 R0 = R2 + R3 ;
560 unlink;
561 RTS;
562
563 /* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
564 /* 36 cycles */
/*
 * z_sad8x8: sum of absolute differences between blk1 and blk2, 8 pixels per
 * row, h rows, accumulated with SAA in A0/A1 and returned in R0.
 *   R0 = blk1, R1 = blk2, R2 = dsz (row pitch of blk1),
 *   [sp+12] = line_size (row pitch of blk2), [sp+16] = h
 * No frame is set up, so the stack offsets are relative to SP on entry.
 */
565 DEFUN(z_sad8x8,mL1,
566 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
567 I0 = R0; // blk1
568 I1 = R1; // blk2
569
570 A1 = A0 = 0;
571 r0 = [sp+12]; // rwidth
572 P2 = [sp+16]; //height
573 R3 = 8;
574 R0 = R0 - R3; // line_size - 8
575 R3 = R2 - R3; // dsz - 8
576 M0 = R3; // blk1 advance to the start of the next row
577 M1 = R0; // blk2 advance to the start of the next row
578
579 LSETUP (s$8, e$8) LC0=P2;
580 DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; // preload both words of row 0
581 DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
582 s$8: SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1]; // pixels 0-3
583 SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; // pixels 4-7; R0/R2 = next row
584 e$8: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
585
586 R3=A1.L+A1.H, R2=A0.L+A0.H ; // fold the four 16-bit partial sums
587 R0 = R2 + R3 ;
588 RTS;
589
/*
 * pix_norm1: return in R0 the sum of the squares of the pixels of a 16x16
 * area.
 *   R0 = pix, R1 = line_size
 * Each iteration unpacks one row of 16 bytes to 16-bit values and
 * accumulates their squares in A0/A1; the two accumulators are added with
 * saturation at the end.
 */
590 DEFUN(pix_norm1,mL1,
591 (uint8_t * pix, int line_size)):
592 [--SP]=(R7:4,P5:3);
593
594 // Fetch the input arguments.
595 P1 = R0; // pix
596 P0 = R1; // line_size
597 P5 = 16; // loop ctr.
598 P0 -= P5;
599 M0 = P0; // M0 = line_size-16;
600 // Now for the real work.
601 A1 = A0 = 0;
602 lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
603 I0 = P1;
604 DISALGNEXCPT || r0 = [i0++]; // preload first word of row 0
605
606 _pix_norm1_blkfn_loopStart:
607 // the following unpacks the 16 bytes of the current row, 4 at a time
608 DISALGNEXCPT || r1 = [i0++];
609
610 (r5, r4) = byteunpack r1:0 || r0 = [i0++]; // pixels 0-3
611 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
612 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
613 (r5, r4) = byteunpack r1:0(r) || r1 = [i0++]; // pixels 4-7
614 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
615 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
616 (r5, r4) = byteunpack r1:0 || r0 = [i0++M0]; // pixels 8-11; I0 moves to the next row
617 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
618 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
619 (r5, r4) = byteunpack r1:0(r) || r0 = [i0++]; // pixels 12-15; preload next row
620 a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
621 _pix_norm1_blkfn_loopEnd:
622 a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
623
624
625 // Clean up at the end:
626 R2 = A0, R3 = A1;
627 R0 = R2 + R3 (S);
628
629 (R7:4,P5:3)=[SP++];
630
631 RTS;
632
/*
 * sse4: return in R0 the sum of squared differences between pix1 and pix2,
 * 4 pixels per row, h rows.
 *   R0 = v (unused), R1 = pix1, R2 = pix2,
 *   [fp+20] = line_size (row pitch of both sources), [fp+24] = h
 * R0/R2 hold the first word of the current row and R1/R3 the following word,
 * so that BYTEOP16M can pick the four bytes at any byte alignment.
 * The first word of the next row is loaded in parallel with the last MAC of
 * the loop; without that load every row after the first would be compared
 * using the stale first word of row 0, and I0/I1 would advance by only
 * line_size - 4 per row.
 */
633 DEFUN(sse4,mL1,
634 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
635 link 0;
636 [--sp] = (r7:6);
637 p0=[fp+24]; // h
638 i0=r1; // pix1
639 i1=r2; // pix2
640 r2=[fp+20]; // line_size
641 r2+=-4;
642 m0=r2; // line_size - 4: advance from the second word of a row to the next row
643
644 a0=a1=0;
645 LSETUP(.S40,.E40) LC0=P0;
646 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of row 0
647
648 .S40: DISALGNEXCPT || R1 = [I0++M0] || R3 = [I1++M0]; // second word; pointers move to the next row
649 (R7,R6) = BYTEOP16M (R1:0,R3:2); // four 16-bit differences
650 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
651 .E40: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is) || R0 = [I0++] || R2 =[I1++]; // first word of the next row
652 a0 += a1;
653 r0 = a0;
654
655 (r7:6) = [sp++];
656 unlink;
657 rts;
658
/*
 * sse8: return in R0 the sum of squared differences between pix1 and pix2,
 * 8 pixels per row, h rows.
 *   R0 = v (unused), R1 = pix1, R2 = pix2,
 *   [fp+20] = line_size (row pitch of both sources), [fp+24] = h
 * BYTEOP16M yields four 16-bit differences at a time; their squares are
 * accumulated in A0/A1, which are summed at the end.
 */
659 DEFUN(sse8,mL1,
660 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
661 link 0;
662 [--sp] = (r7:6);
663 p0=[fp+24]; // h
664 i0=r1; // pix1
665 i1=r2; // pix2
666 r2=[fp+20]; // line_size
667 r2+=-8;
668 m0=r2; // line_size - 8: source advance to the start of the next row
669
670 a0=a1=0;
671 LSETUP(.S80,.E80) LC0=P0;
672 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of row 0
673
674 .S80: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
675 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0]; // pixels 0-3
676 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
677 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
678 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++]; // pixels 4-7; preload next row
679 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
680 .E80: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
681 a0 += a1;
682 r0 = a0;
683
684 (r7:6) = [sp++];
685 unlink;
686 rts;
687
/*
 * sse16: return in R0 the sum of squared differences between pix1 and pix2,
 * 16 pixels per row, h rows.
 *   R0 = v (unused), R1 = pix1, R2 = pix2,
 *   [fp+20] = line_size (row pitch of both sources), [fp+24] = h
 * BYTEOP16M yields four 16-bit differences at a time; their squares are
 * accumulated in A0/A1, which are summed at the end.
 */
688 DEFUN(sse16,mL1,
689 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
690 link 0;
691 [--sp] = (r7:6);
692 p0=[fp+24]; // h
693 i0=r1; // pix1
694 i1=r2; // pix2
695 r2=[fp+20]; // line_size
696 r2+=-16;
697 m0=r2; // line_size - 16: source advance to the start of the next row
698
699 a0=a1=0;
700 DISALGNEXCPT || R0 = [I0++] || R2 =[I1++]; // preload first word of row 0
701 LSETUP(.S160,.E160) LC0=P0;
702
703 .S160: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
704 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++]; // pixels 0-3
705 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
706 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
707 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++] || R3 = [I1++]; // pixels 4-7
708 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
709 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
710 (R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0]; // pixels 8-11
711 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
712 a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
713 (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++]; // pixels 12-15; preload next row
714 a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
715 .E160: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
716 a0 += a1;
717 r0 = a0;
718
719 (r7:6) = [sp++];
720 unlink;
721 rts;
722
723