annotate alpha/dsputil_alpha.c @ 511:fa4425cf6b31 libavcodec

Assembly version of put_pixels. This is currently the function that takes the most time, and it allows for more efficient unaligned access and better control over memory latencies.
author mellum
date Mon, 01 Jul 2002 23:47:01 +0000
parents cab79946302f
children fb670ca9f8eb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
1 /*
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
2 * Alpha optimized DSP utils
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
4 *
429
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
5 * This library is free software; you can redistribute it and/or
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
6 * modify it under the terms of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
7 * License as published by the Free Software Foundation; either
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
8 * version 2 of the License, or (at your option) any later version.
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
9 *
429
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
10 * This library is distributed in the hope that it will be useful,
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
429
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
13 * Lesser General Public License for more details.
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
14 *
429
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
15 * You should have received a copy of the GNU Lesser General Public
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
16 * License along with this library; if not, write to the Free Software
718a22dc121f license/copyright change
glantau
parents: 214
diff changeset
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
18 */
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
19
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
20 #include "asm.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
21 #include "../dsputil.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
22
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
23 void simple_idct_axp(DCTELEM *block);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
24
511
fa4425cf6b31 Assembly version of put_pixels. This is currently the function that
mellum
parents: 509
diff changeset
25 void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
fa4425cf6b31 Assembly version of put_pixels. This is currently the function that
mellum
parents: 509
diff changeset
26 int line_size, int h);
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
27 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
28 int line_size);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
29 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
30 int line_size);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
31
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
32 #if 0
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
33 /* These functions were the base for the optimized assembler routines,
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
34 and remain here for documentation purposes. */
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
35 static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
36 int line_size)
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
37 {
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
38 int i = 8;
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
39 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
505
7a976bf93394 Ugly hack to make the assembler accept MVI instructions.
mellum
parents: 429
diff changeset
40
7a976bf93394 Ugly hack to make the assembler accept MVI instructions.
mellum
parents: 429
diff changeset
41 ASM_ACCEPT_MVI;
7a976bf93394 Ugly hack to make the assembler accept MVI instructions.
mellum
parents: 429
diff changeset
42
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
43 do {
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
44 uint64_t shorts0, shorts1;
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
45
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
46 shorts0 = ldq(block);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
47 shorts0 = maxsw4(shorts0, 0);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
48 shorts0 = minsw4(shorts0, clampmask);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
49 stl(pkwb(shorts0), pixels);
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
50
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
51 shorts1 = ldq(block + 4);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
52 shorts1 = maxsw4(shorts1, 0);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
53 shorts1 = minsw4(shorts1, clampmask);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
54 stl(pkwb(shorts1), pixels + 4);
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
55
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
56 pixels += line_size;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
57 block += 8;
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
58 } while (--i);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
59 }
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
60
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
61 void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
62 int line_size)
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
63 {
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
64 int h = 8;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
65 /* Keep this function a leaf function by generating the constants
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
66 manually (mainly for the hack value ;-). */
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
67 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
68 uint64_t signmask = zap(-1, 0x33);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
69 signmask ^= signmask >> 1; /* 0x8000800080008000 */
505
7a976bf93394 Ugly hack to make the assembler accept MVI instructions.
mellum
parents: 429
diff changeset
70
7a976bf93394 Ugly hack to make the assembler accept MVI instructions.
mellum
parents: 429
diff changeset
71 ASM_ACCEPT_MVI;
7a976bf93394 Ugly hack to make the assembler accept MVI instructions.
mellum
parents: 429
diff changeset
72
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
73 do {
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
74 uint64_t shorts0, pix0, signs0;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
75 uint64_t shorts1, pix1, signs1;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
76
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
77 shorts0 = ldq(block);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
78 shorts1 = ldq(block + 4);
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
79
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
80 pix0 = unpkbw(ldl(pixels));
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
81 /* Signed subword add (MMX paddw). */
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
82 signs0 = shorts0 & signmask;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
83 shorts0 &= ~signmask;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
84 shorts0 += pix0;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
85 shorts0 ^= signs0;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
86 /* Clamp. */
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
87 shorts0 = maxsw4(shorts0, 0);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
88 shorts0 = minsw4(shorts0, clampmask);
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
89
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
90 /* Next 4. */
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
91 pix1 = unpkbw(ldl(pixels + 4));
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
92 signs1 = shorts1 & signmask;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
93 shorts1 &= ~signmask;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
94 shorts1 += pix1;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
95 shorts1 ^= signs1;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
96 shorts1 = maxsw4(shorts1, 0);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
97 shorts1 = minsw4(shorts1, clampmask);
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
98
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
99 stl(pkwb(shorts0), pixels);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
100 stl(pkwb(shorts1), pixels + 4);
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
101
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
102 pixels += line_size;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
103 block += 8;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
104 } while (--h);
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
105 }
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
106 #endif
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
107
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
108 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
109 Since the immediate result could be greater than 255, we do the
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
110 shift first. The result is too low by one if the bytes were both
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
111 odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
112 static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
113 {
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
114 UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
115 l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
116 l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
117 return l1 + l2 + correction;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
118 }
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
119
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
120 /* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
121 The '1' only has an effect when one byte is even and the other odd,
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
122 i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
123 Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
124 static inline UINT64 avg2(UINT64 l1, UINT64 l2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
125 {
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
126 UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
127 l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
128 l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
129 return l1 + l2 + correction;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
130 }
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
131
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
132 static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
133 {
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
134 UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
135 + ((l2 & ~BYTE_VEC(0x03)) >> 2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
136 + ((l3 & ~BYTE_VEC(0x03)) >> 2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
137 + ((l4 & ~BYTE_VEC(0x03)) >> 2);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
138 UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
139 + (l2 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
140 + (l3 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
141 + (l4 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
142 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
143 return r1 + r2;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
144 }
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
145
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
146 static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
147 {
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
148 UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
149 + ((l2 & ~BYTE_VEC(0x03)) >> 2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
150 + ((l3 & ~BYTE_VEC(0x03)) >> 2)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
151 + ((l4 & ~BYTE_VEC(0x03)) >> 2);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
152 UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
153 + (l2 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
154 + (l3 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
155 + (l4 & BYTE_VEC(0x03))
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
156 + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
157 return r1 + r2;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
158 }
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
159
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
160 #define PIXOPNAME(suffix) put ## suffix
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
161 #define BTYPE UINT8
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
162 #define AVG2 avg2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
163 #define AVG4 avg4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
164 #define STORE(l, b) stq(l, b)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
165 #include "pixops.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
166 #undef PIXOPNAME
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
167 #undef BTYPE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
168 #undef AVG2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
169 #undef AVG4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
170 #undef STORE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
171
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
172 #define PIXOPNAME(suffix) put_no_rnd ## suffix
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
173 #define BTYPE UINT8
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
174 #define AVG2 avg2_no_rnd
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
175 #define AVG4 avg4_no_rnd
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
176 #define STORE(l, b) stq(l, b)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
177 #include "pixops.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
178 #undef PIXOPNAME
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
179 #undef BTYPE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
180 #undef AVG2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
181 #undef AVG4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
182 #undef STORE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
183
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
184 /* The following functions are untested. */
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
185 #if 0
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
186
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
187 #define PIXOPNAME(suffix) avg ## suffix
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
188 #define BTYPE UINT8
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
189 #define AVG2 avg2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
190 #define AVG4 avg4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
191 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
192 #include "pixops.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
193 #undef PIXOPNAME
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
194 #undef BTYPE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
195 #undef AVG2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
196 #undef AVG4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
197 #undef STORE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
198
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
199 #define PIXOPNAME(suffix) avg_no_rnd ## suffix
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
200 #define BTYPE UINT8
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
201 #define AVG2 avg2_no_rnd
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
202 #define AVG4 avg4_no_rnd
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
203 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
204 #include "pixops.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
205 #undef PIXOPNAME
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
206 #undef BTYPE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
207 #undef AVG2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
208 #undef AVG4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
209 #undef STORE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
210
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
211 #define PIXOPNAME(suffix) sub ## suffix
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
212 #define BTYPE DCTELEM
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
213 #define AVG2 avg2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
214 #define AVG4 avg4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
215 #define STORE(l, block) do { \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
216 UINT64 xxx = l; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
217 (block)[0] -= (xxx >> 0) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
218 (block)[1] -= (xxx >> 8) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
219 (block)[2] -= (xxx >> 16) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
220 (block)[3] -= (xxx >> 24) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
221 (block)[4] -= (xxx >> 32) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
222 (block)[5] -= (xxx >> 40) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
223 (block)[6] -= (xxx >> 48) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
224 (block)[7] -= (xxx >> 56) & 0xff; \
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
225 } while (0)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
226 #include "pixops.h"
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
227 #undef PIXOPNAME
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
228 #undef BTYPE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
229 #undef AVG2
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
230 #undef AVG4
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
231 #undef STORE
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
232
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
233 #endif
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
234
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
235 void dsputil_init_alpha(void)
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
236 {
511
fa4425cf6b31 Assembly version of put_pixels. This is currently the function that
mellum
parents: 509
diff changeset
237 put_pixels_tab[0] = put_pixels_axp_asm;
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
238 put_pixels_tab[1] = put_pixels_x2_axp;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
239 put_pixels_tab[2] = put_pixels_y2_axp;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
240 put_pixels_tab[3] = put_pixels_xy2_axp;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
241
511
fa4425cf6b31 Assembly version of put_pixels. This is currently the function that
mellum
parents: 509
diff changeset
242 put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
243 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
244 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
245 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
246
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
247 /* amask clears all bits that correspond to present features. */
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
248 if (amask(AMASK_MVI) == 0) {
509
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
249 put_pixels_clamped = put_pixels_clamped_mvi_asm;
cab79946302f Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents: 505
diff changeset
250 add_pixels_clamped = add_pixels_clamped_mvi_asm;
214
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
251 }
73df666cacc7 Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff changeset
252 }