Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_rnd.h @ 7170:04da42c2b7b4 libavcodec
gain code, gain pitch and pitch delay decoding for ACELP based codecs
author | voroshil |
---|---|
date | Mon, 30 Jun 2008 18:03:38 +0000 |
parents | 33896780c612 |
children | eebc7209c47f |
rev | line source |
---|---|
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
1 /* |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
2 * DSP utils mmx functions are compiled twice for rnd/no_rnd |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1064
diff
changeset
|
4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
5 * |
5214 | 6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | |
8 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
9 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
10 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
11 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
12 * FFmpeg is free software; you can redistribute it and/or |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
13 * modify it under the terms of the GNU Lesser General Public |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
14 * License as published by the Free Software Foundation; either |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
15 * version 2.1 of the License, or (at your option) any later version. |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
16 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
17 * FFmpeg is distributed in the hope that it will be useful, |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
20 * Lesser General Public License for more details. |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
21 * |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
22 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
23 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
25 */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
26 |
5833
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
27 /* This header intentionally has no multiple inclusion guards. It is meant to |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
28 * be included multiple times and generates different code depending on the |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
29 * value of certain #defines. */ |
dfd34e7f243f
Explain why there are no multiple inclusion guards in these header files.
diego
parents:
5831
diff
changeset
|
30 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
31 // put_pixels |
/*
 * put_pixels8_x2: for an 8-byte-wide block, combines each source row with the
 * same row shifted by one byte (loads at pixels and pixels+1) and stores the
 * result to block.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (used as the
 * stride for both source and destination).
 *
 * Each loop iteration handles 4 rows (two groups of 2) and the loop exits
 * only when h reaches exactly 0 ("subl $4" / "jnz"), so h is expected to be
 * a positive multiple of 4.
 *
 * MOVQ_BFE and PAVGBP are macros defined outside this file; presumably
 * PAVGBP(a, b, r1, c, d, r2) computes two packed byte averages (r1 from a,b
 * and r2 from c,d) using the constant MOVQ_BFE puts in mm6, with rounding
 * selected by the rnd/no_rnd build -- confirm against the including file.
 */
1064 | 32 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
33 { |
448 | 34 MOVQ_BFE(mm6); |
6392 | 35 asm volatile( |
/* REG_a = 2 * line_size: the pointer step after each pair of rows */
2979 | 36 "lea (%3, %3), %%"REG_a" \n\t" |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
37 ASMALIGN(3) |
2979 | 38 "1: \n\t" |
/* first pair of rows */
39 "movq (%1), %%mm0 \n\t" | |
40 "movq 1(%1), %%mm1 \n\t" | |
41 "movq (%1, %3), %%mm2 \n\t" | |
42 "movq 1(%1, %3), %%mm3 \n\t" | |
43 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
44 "movq %%mm4, (%2) \n\t" | |
45 "movq %%mm5, (%2, %3) \n\t" | |
46 "add %%"REG_a", %1 \n\t" | |
47 "add %%"REG_a", %2 \n\t" | |
/* second pair of rows */
48 "movq (%1), %%mm0 \n\t" | |
49 "movq 1(%1), %%mm1 \n\t" | |
50 "movq (%1, %3), %%mm2 \n\t" | |
51 "movq 1(%1, %3), %%mm3 \n\t" | |
52 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
53 "movq %%mm4, (%2) \n\t" | |
54 "movq %%mm5, (%2, %3) \n\t" | |
55 "add %%"REG_a", %1 \n\t" | |
56 "add %%"REG_a", %2 \n\t" | |
57 "subl $4, %0 \n\t" | |
58 "jnz 1b \n\t" | |
59 :"+g"(h), "+S"(pixels), "+D"(block) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
60 :"r"((x86_reg)line_size) |
2979 | 61 :REG_a, "memory"); |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
62 } |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
63 |
5083
ce36118abbbb
rename attribute_unused to av_unused and moves its declaration to common.h
benoit
parents:
5006
diff
changeset
|
/*
 * put_pixels8_l2: combines two 8-byte-wide sources row by row (via
 * PAVGB/PAVGBP) and stores the result to dst.
 *
 * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
 * %5 = dstStride.
 * src1 advances by src1Stride per row and dst by dstStride per row; src2 is
 * read as tightly packed 8-byte rows (it advances by 8 bytes per row).
 *
 * If h is odd, one row is processed before the main loop and h is
 * decremented. The main loop then handles 4 rows per iteration and exits
 * only when h reaches exactly 0, so the remaining count must be a positive
 * multiple of 4.
 *
 * MOVQ_BFE, PAVGB and PAVGBP are macros defined outside this file;
 * presumably they implement packed byte averaging with mm6 as a helper
 * constant -- confirm against the including file.
 */
64 static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
954 | 65 { |
66 MOVQ_BFE(mm6); | |
6392 | 67 asm volatile( |
/* odd h: handle a single row first, otherwise jump straight to the loop */
2979 | 68 "testl $1, %0 \n\t" |
69 " jz 1f \n\t" | |
70 "movq (%1), %%mm0 \n\t" | |
71 "movq (%2), %%mm1 \n\t" | |
72 "add %4, %1 \n\t" | |
73 "add $8, %2 \n\t" | |
74 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) | |
75 "movq %%mm4, (%3) \n\t" | |
76 "add %5, %3 \n\t" | |
77 "decl %0 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
78 ASMALIGN(3) |
2979 | 79 "1: \n\t" |
/* rows 0 and 1 of this iteration: src2 offsets 0 and 8 */
80 "movq (%1), %%mm0 \n\t" | |
81 "movq (%2), %%mm1 \n\t" | |
82 "add %4, %1 \n\t" | |
83 "movq (%1), %%mm2 \n\t" | |
84 "movq 8(%2), %%mm3 \n\t" | |
85 "add %4, %1 \n\t" | |
86 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
87 "movq %%mm4, (%3) \n\t" | |
88 "add %5, %3 \n\t" | |
89 "movq %%mm5, (%3) \n\t" | |
90 "add %5, %3 \n\t" | |
/* rows 2 and 3 of this iteration: src2 offsets 16 and 24, then src2 += 32 */
91 "movq (%1), %%mm0 \n\t" | |
92 "movq 16(%2), %%mm1 \n\t" | |
93 "add %4, %1 \n\t" | |
94 "movq (%1), %%mm2 \n\t" | |
95 "movq 24(%2), %%mm3 \n\t" | |
96 "add %4, %1 \n\t" | |
97 "add $32, %2 \n\t" | |
98 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
99 "movq %%mm4, (%3) \n\t" | |
100 "add %5, %3 \n\t" | |
101 "movq %%mm5, (%3) \n\t" | |
102 "add %5, %3 \n\t" | |
103 "subl $4, %0 \n\t" | |
104 "jnz 1b \n\t" | |
/* under PIC, h is kept in memory ("+m") instead of the "b" register */
5127 | 105 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
967 | 106 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
107 #else | |
108 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) | |
109 #endif | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
110 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) |
2979 | 111 :"memory"); |
954 | 112 } |
113 | |
/*
 * put_pixels16_x2: 16-byte-wide variant of the x2 "put" routine. For every
 * row it combines the bytes at pixels with the bytes at pixels+1 and stores
 * the result to block, handling each row as two 8-byte halves (offsets 0/1
 * and 8/9).
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (stride of both
 * source and destination).
 *
 * Each loop iteration handles 4 rows and the loop exits only when h reaches
 * exactly 0, so h is expected to be a positive multiple of 4.
 *
 * MOVQ_BFE and PAVGBP are defined outside this file; presumably PAVGBP
 * produces two packed byte averages per invocation -- confirm against the
 * including file.
 */
1064 | 114 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651 | 115 { |
116 MOVQ_BFE(mm6); | |
6392 | 117 asm volatile( |
/* REG_a = 2 * line_size: the pointer step after each pair of rows */
2979 | 118 "lea (%3, %3), %%"REG_a" \n\t" |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
119 ASMALIGN(3) |
2979 | 120 "1: \n\t" |
/* first pair of rows, left 8 bytes */
121 "movq (%1), %%mm0 \n\t" | |
122 "movq 1(%1), %%mm1 \n\t" | |
123 "movq (%1, %3), %%mm2 \n\t" | |
124 "movq 1(%1, %3), %%mm3 \n\t" | |
125 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
126 "movq %%mm4, (%2) \n\t" | |
127 "movq %%mm5, (%2, %3) \n\t" | |
/* first pair of rows, right 8 bytes */
128 "movq 8(%1), %%mm0 \n\t" | |
129 "movq 9(%1), %%mm1 \n\t" | |
130 "movq 8(%1, %3), %%mm2 \n\t" | |
131 "movq 9(%1, %3), %%mm3 \n\t" | |
132 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
133 "movq %%mm4, 8(%2) \n\t" | |
134 "movq %%mm5, 8(%2, %3) \n\t" | |
135 "add %%"REG_a", %1 \n\t" | |
136 "add %%"REG_a", %2 \n\t" | |
/* second pair of rows, left 8 bytes */
137 "movq (%1), %%mm0 \n\t" | |
138 "movq 1(%1), %%mm1 \n\t" | |
139 "movq (%1, %3), %%mm2 \n\t" | |
140 "movq 1(%1, %3), %%mm3 \n\t" | |
141 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
142 "movq %%mm4, (%2) \n\t" | |
143 "movq %%mm5, (%2, %3) \n\t" | |
/* second pair of rows, right 8 bytes */
144 "movq 8(%1), %%mm0 \n\t" | |
145 "movq 9(%1), %%mm1 \n\t" | |
146 "movq 8(%1, %3), %%mm2 \n\t" | |
147 "movq 9(%1, %3), %%mm3 \n\t" | |
148 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
149 "movq %%mm4, 8(%2) \n\t" | |
150 "movq %%mm5, 8(%2, %3) \n\t" | |
151 "add %%"REG_a", %1 \n\t" | |
152 "add %%"REG_a", %2 \n\t" | |
153 "subl $4, %0 \n\t" | |
154 "jnz 1b \n\t" | |
155 :"+g"(h), "+S"(pixels), "+D"(block) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
156 :"r"((x86_reg)line_size) |
2979 | 157 :REG_a, "memory"); |
651 | 158 } |
159 | |
5083
ce36118abbbb
rename attribute_unused to av_unused and moves its declaration to common.h
benoit
parents:
5006
diff
changeset
|
/*
 * put_pixels16_l2: 16-byte-wide variant of the l2 "put" routine. Combines
 * two sources row by row (via PAVGBP) and stores the result to dst, handling
 * each row as two 8-byte halves.
 *
 * Operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride,
 * %5 = dstStride.
 * src1 advances by src1Stride per row and dst by dstStride per row; src2 is
 * read as tightly packed 16-byte rows (it advances by 16 bytes per row).
 *
 * If h is odd, one row is processed before the main loop and h is
 * decremented. The main loop then handles 2 rows per iteration and exits
 * only when h reaches exactly 0.
 *
 * MOVQ_BFE and PAVGBP are defined outside this file; presumably PAVGBP
 * produces two packed byte averages per invocation -- confirm against the
 * including file.
 */
160 static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
954 | 161 { |
162 MOVQ_BFE(mm6); | |
6392 | 163 asm volatile( |
/* odd h: handle a single 16-byte row first, otherwise jump to the loop */
2979 | 164 "testl $1, %0 \n\t" |
165 " jz 1f \n\t" | |
166 "movq (%1), %%mm0 \n\t" | |
167 "movq (%2), %%mm1 \n\t" | |
168 "movq 8(%1), %%mm2 \n\t" | |
169 "movq 8(%2), %%mm3 \n\t" | |
170 "add %4, %1 \n\t" | |
171 "add $16, %2 \n\t" | |
172 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
173 "movq %%mm4, (%3) \n\t" | |
174 "movq %%mm5, 8(%3) \n\t" | |
175 "add %5, %3 \n\t" | |
176 "decl %0 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
177 ASMALIGN(3) |
2979 | 178 "1: \n\t" |
/* first row of this iteration: src2 offsets 0 and 8 */
179 "movq (%1), %%mm0 \n\t" | |
180 "movq (%2), %%mm1 \n\t" | |
181 "movq 8(%1), %%mm2 \n\t" | |
182 "movq 8(%2), %%mm3 \n\t" | |
183 "add %4, %1 \n\t" | |
184 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
185 "movq %%mm4, (%3) \n\t" | |
186 "movq %%mm5, 8(%3) \n\t" | |
187 "add %5, %3 \n\t" | |
/* second row of this iteration: src2 offsets 16 and 24, then src2 += 32 */
188 "movq (%1), %%mm0 \n\t" | |
189 "movq 16(%2), %%mm1 \n\t" | |
190 "movq 8(%1), %%mm2 \n\t" | |
191 "movq 24(%2), %%mm3 \n\t" | |
192 "add %4, %1 \n\t" | |
193 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
194 "movq %%mm4, (%3) \n\t" | |
195 "movq %%mm5, 8(%3) \n\t" | |
196 "add %5, %3 \n\t" | |
197 "add $32, %2 \n\t" | |
198 "subl $2, %0 \n\t" | |
199 "jnz 1b \n\t" | |
/* under PIC, h is kept in memory ("+m") instead of the "b" register */
5127 | 200 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used |
2979 | 201 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
967 | 202 #else |
2979 | 203 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) |
967 | 204 #endif |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
205 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) |
2979 | 206 :"memory"); |
954 | 207 } |
208 | |
/*
 * put_pixels8_y2: for an 8-byte-wide block, combines each source row with
 * the row below it and stores the result to block.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (stride of both
 * source and destination).
 *
 * The first source row is loaded once before the loop; after that each
 * source row is loaded only once and reused as the "upper" row of the next
 * output row (the registers mm0/mm2 alternate in that role between the two
 * halves of the loop body). In total h + 1 source rows are read.
 *
 * Each loop iteration produces 4 rows and the loop exits only when h reaches
 * exactly 0, so h is expected to be a positive multiple of 4.
 *
 * MOVQ_BFE and PAVGBP are defined outside this file; presumably PAVGBP
 * produces two packed byte averages per invocation -- confirm against the
 * including file.
 */
1064 | 209 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
210 { |
448 | 211 MOVQ_BFE(mm6); |
6392 | 212 asm volatile( |
/* REG_a = 2 * line_size; mm0 = first source row */
2979 | 213 "lea (%3, %3), %%"REG_a" \n\t" |
214 "movq (%1), %%mm0 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
215 ASMALIGN(3) |
2979 | 216 "1: \n\t" |
/* mm1 = row+1, mm2 = row+2; outputs pair (mm1,mm0) and (mm2,mm1) */
217 "movq (%1, %3), %%mm1 \n\t" | |
218 "movq (%1, %%"REG_a"),%%mm2 \n\t" | |
219 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
220 "movq %%mm4, (%2) \n\t" | |
221 "movq %%mm5, (%2, %3) \n\t" | |
222 "add %%"REG_a", %1 \n\t" | |
223 "add %%"REG_a", %2 \n\t" | |
/* same again with mm2 as the carried-over row; mm0 is left holding the last row read */
224 "movq (%1, %3), %%mm1 \n\t" | |
225 "movq (%1, %%"REG_a"),%%mm0 \n\t" | |
226 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
227 "movq %%mm4, (%2) \n\t" | |
228 "movq %%mm5, (%2, %3) \n\t" | |
229 "add %%"REG_a", %1 \n\t" | |
230 "add %%"REG_a", %2 \n\t" | |
231 "subl $4, %0 \n\t" | |
232 "jnz 1b \n\t" | |
233 :"+g"(h), "+S"(pixels), "+D"(block) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
234 :"r"((x86_reg)line_size) |
2979 | 235 :REG_a, "memory"); |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
236 } |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
237 |
/*
 * put_pixels8_xy2: for an 8-byte-wide block, each output byte is computed
 * from the 2x2 source neighbourhood (x, x+1) x (row, row+1) as
 *     (a + b + c + d + mm6) >> 2
 * using 16-bit intermediate arithmetic (bytes are widened with punpck*bw
 * against mm7 and narrowed again with packuswb).
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (stride of both
 * source and destination). REG_a is a running byte offset applied to both
 * pixels (already advanced by one row) and block.
 *
 * The horizontal sum of the first row is computed before the loop; inside
 * the loop each row's horizontal sum is computed once and reused for the
 * following output row. In total h + 1 source rows are read.
 *
 * Each loop iteration produces 2 rows and the loop exits only when h reaches
 * exactly 0, so h is expected to be a positive even number.
 *
 * MOVQ_ZERO and SET_RND are defined outside this file; presumably
 * MOVQ_ZERO(mm7) clears mm7 (it is used as the zero operand of the unpack
 * instructions) -- confirm against the including file.
 */
1064 | 238 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
239 { |
448 | 240 MOVQ_ZERO(mm7); |
241 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
6392 | 242 asm volatile( |
/* mm4/mm5 = low/high word halves of row0[x] + row0[x+1] */
2979 | 243 "movq (%1), %%mm0 \n\t" |
244 "movq 1(%1), %%mm4 \n\t" | |
245 "movq %%mm0, %%mm1 \n\t" | |
246 "movq %%mm4, %%mm5 \n\t" | |
247 "punpcklbw %%mm7, %%mm0 \n\t" | |
248 "punpcklbw %%mm7, %%mm4 \n\t" | |
249 "punpckhbw %%mm7, %%mm1 \n\t" | |
250 "punpckhbw %%mm7, %%mm5 \n\t" | |
251 "paddusw %%mm0, %%mm4 \n\t" | |
252 "paddusw %%mm1, %%mm5 \n\t" | |
/* REG_a = 0 (row offset); pixels now points at the second source row */
253 "xor %%"REG_a", %%"REG_a" \n\t" | |
254 "add %3, %1 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
255 ASMALIGN(3) |
2979 | 256 "1: \n\t" |
/* mm0/mm1 = horizontal sum of the next row; combined with mm4/mm5 for the output */
257 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
258 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
259 "movq %%mm0, %%mm1 \n\t" | |
260 "movq %%mm2, %%mm3 \n\t" | |
261 "punpcklbw %%mm7, %%mm0 \n\t" | |
262 "punpcklbw %%mm7, %%mm2 \n\t" | |
263 "punpckhbw %%mm7, %%mm1 \n\t" | |
264 "punpckhbw %%mm7, %%mm3 \n\t" | |
265 "paddusw %%mm2, %%mm0 \n\t" | |
266 "paddusw %%mm3, %%mm1 \n\t" | |
267 "paddusw %%mm6, %%mm4 \n\t" | |
268 "paddusw %%mm6, %%mm5 \n\t" | |
269 "paddusw %%mm0, %%mm4 \n\t" | |
270 "paddusw %%mm1, %%mm5 \n\t" | |
271 "psrlw $2, %%mm4 \n\t" | |
272 "psrlw $2, %%mm5 \n\t" | |
273 "packuswb %%mm5, %%mm4 \n\t" | |
274 "movq %%mm4, (%2, %%"REG_a") \n\t" | |
275 "add %3, %%"REG_a" \n\t" | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
276 |
/* second row of the iteration: same computation with the register roles swapped */
2979 | 277 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
278 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
279 "movq %%mm2, %%mm3 \n\t" | |
280 "movq %%mm4, %%mm5 \n\t" | |
281 "punpcklbw %%mm7, %%mm2 \n\t" | |
282 "punpcklbw %%mm7, %%mm4 \n\t" | |
283 "punpckhbw %%mm7, %%mm3 \n\t" | |
284 "punpckhbw %%mm7, %%mm5 \n\t" | |
285 "paddusw %%mm2, %%mm4 \n\t" | |
286 "paddusw %%mm3, %%mm5 \n\t" | |
287 "paddusw %%mm6, %%mm0 \n\t" | |
288 "paddusw %%mm6, %%mm1 \n\t" | |
289 "paddusw %%mm4, %%mm0 \n\t" | |
290 "paddusw %%mm5, %%mm1 \n\t" | |
291 "psrlw $2, %%mm0 \n\t" | |
292 "psrlw $2, %%mm1 \n\t" | |
293 "packuswb %%mm1, %%mm0 \n\t" | |
294 "movq %%mm0, (%2, %%"REG_a") \n\t" | |
295 "add %3, %%"REG_a" \n\t" | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
296 |
2979 | 297 "subl $2, %0 \n\t" |
298 "jnz 1b \n\t" | |
299 :"+g"(h), "+S"(pixels) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
300 :"D"(block), "r"((x86_reg)line_size) |
2979 | 301 :REG_a, "memory"); |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
302 } |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
303 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
diff
changeset
|
304 // avg_pixels |
5083
ce36118abbbb
rename attribute_unused to av_unused and moves its declaration to common.h
benoit
parents:
5006
diff
changeset
|
/*
 * avg_pixels4: for each of h rows, loads 4 bytes from block and 4 bytes from
 * pixels (movd), combines them with PAVGB and writes the 4-byte result back
 * to block. Both pointers advance by line_size per row.
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file; presumably
 * PAVGB(a, b, r, c) stores a packed byte average of a and b in r using the
 * constant in c -- confirm against the including file.
 */
305 static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
2209 | 306 { |
307 MOVQ_BFE(mm6); | |
308 JUMPALIGN(); | |
309 do { | |
6392 | 310 asm volatile( |
/* %0 = *block (read and written), %1 = *pixels */
2979 | 311 "movd %0, %%mm0 \n\t" |
312 "movd %1, %%mm1 \n\t" | |
313 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
314 "movd %%mm2, %0 \n\t" | |
315 :"+m"(*block) | |
316 :"m"(*pixels) | |
317 :"memory"); | |
318 pixels += line_size; | |
319 block += line_size; | |
2209 | 320 } |
321 while (--h); | |
322 } | |
323 | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
324 // in case more speed is needed - unrolling would certainly help |
/*
 * avg_pixels8: for each of h rows, loads 8 bytes from block and 8 bytes from
 * pixels, combines them with PAVGB and writes the 8-byte result back to
 * block. Both pointers advance by line_size per row.
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file; presumably
 * PAVGB(a, b, r, c) stores a packed byte average of a and b in r using the
 * constant in c -- confirm against the including file.
 */
1064 | 325 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
326 { |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
327 MOVQ_BFE(mm6); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
328 JUMPALIGN(); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
329 do { |
6392 | 330 asm volatile( |
/* %0 = *block (read and written), %1 = *pixels */
2979 | 331 "movq %0, %%mm0 \n\t" |
332 "movq %1, %%mm1 \n\t" | |
333 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
334 "movq %%mm2, %0 \n\t" | |
335 :"+m"(*block) | |
336 :"m"(*pixels) | |
337 :"memory"); | |
338 pixels += line_size; | |
339 block += line_size; | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
340 } |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
341 while (--h); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
342 } |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
343 |
/*
 * avg_pixels16: 16-byte-wide variant of avg_pixels8. For each of h rows it
 * combines block with pixels (via PAVGB) and writes the result back to
 * block, as two 8-byte halves at offsets 0 and 8. Both pointers advance by
 * line_size per row.
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * NOTE(review): "8%0" / "8%1" prefix a displacement of 8 to the textual form
 * of the memory operand; this presumably relies on the compiler emitting the
 * operand without a displacement of its own (e.g. "(%reg)") -- confirm.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file.
 */
1064 | 344 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651 | 345 { |
346 MOVQ_BFE(mm6); | |
347 JUMPALIGN(); | |
348 do { | |
6392 | 349 asm volatile( |
/* left 8 bytes */
2979 | 350 "movq %0, %%mm0 \n\t" |
351 "movq %1, %%mm1 \n\t" | |
352 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
353 "movq %%mm2, %0 \n\t" | |
/* right 8 bytes */
354 "movq 8%0, %%mm0 \n\t" | |
355 "movq 8%1, %%mm1 \n\t" | |
356 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
357 "movq %%mm2, 8%0 \n\t" | |
358 :"+m"(*block) | |
359 :"m"(*pixels) | |
360 :"memory"); | |
361 pixels += line_size; | |
362 block += line_size; | |
651 | 363 } |
364 while (--h); | |
365 } | |
366 | |
/*
 * avg_pixels8_x2: for each of h rows, first combines the 8 bytes at pixels
 * with the 8 bytes at pixels+1 (PAVGB into mm2), then combines that result
 * with the 8 bytes currently in block (PAVGB into mm0) and writes it back to
 * block. Both pointers advance by line_size per row.
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * NOTE(review): "1%1" prefixes a displacement of 1 to the textual form of
 * the memory operand; this presumably relies on the compiler emitting the
 * operand without a displacement of its own -- confirm.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file.
 */
1064 | 367 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
368 { |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
369 MOVQ_BFE(mm6); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
370 JUMPALIGN(); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
371 do { |
6392 | 372 asm volatile( |
/* %0 = *block (read and written), %1 = *pixels */
2979 | 373 "movq %1, %%mm0 \n\t" |
374 "movq 1%1, %%mm1 \n\t" | |
375 "movq %0, %%mm3 \n\t" | |
376 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
377 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
378 "movq %%mm0, %0 \n\t" | |
379 :"+m"(*block) | |
380 :"m"(*pixels) | |
381 :"memory"); | |
382 pixels += line_size; | |
383 block += line_size; | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
384 } while (--h); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
385 } |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
386 |
5083
ce36118abbbb
rename attribute_unused to av_unused and moves its declaration to common.h
benoit
parents:
5006
diff
changeset
|
/*
 * avg_pixels8_l2: for each of h rows, combines 8 bytes of src1 with 8 bytes
 * of src2 (PAVGB into mm2), then combines that result with the 8 bytes
 * currently in dst (PAVGB into mm0) and writes it back to dst.
 *
 * dst advances by dstStride and src1 by src1Stride per row; src2 is read as
 * tightly packed 8-byte rows (it advances by 8 per row).
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file.
 */
387 static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
954 | 388 { |
389 MOVQ_BFE(mm6); | |
390 JUMPALIGN(); | |
391 do { | |
6392 | 392 asm volatile( |
/* %0 = *dst (read and written), %1 = *src1, %2 = *src2 */
2979 | 393 "movq %1, %%mm0 \n\t" |
394 "movq %2, %%mm1 \n\t" | |
395 "movq %0, %%mm3 \n\t" | |
396 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
397 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
398 "movq %%mm0, %0 \n\t" | |
399 :"+m"(*dst) | |
400 :"m"(*src1), "m"(*src2) | |
401 :"memory"); | |
402 dst += dstStride; | |
954 | 403 src1 += src1Stride; |
404 src2 += 8; | |
405 } while (--h); | |
406 } | |
407 | |
/*
 * avg_pixels16_x2: 16-byte-wide variant of avg_pixels8_x2. For each of h
 * rows and for each 8-byte half (offsets 0 and 8), it combines the bytes at
 * pixels with the bytes at pixels+1, then combines that result with the
 * bytes currently in block and writes it back to block. Both pointers
 * advance by line_size per row.
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * NOTE(review): "1%1", "8%1", "9%1" and "8%0" prefix a displacement to the
 * textual form of the memory operand; this presumably relies on the compiler
 * emitting the operand without a displacement of its own -- confirm.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file.
 */
1064 | 408 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
651 | 409 { |
410 MOVQ_BFE(mm6); | |
411 JUMPALIGN(); | |
412 do { | |
6392 | 413 asm volatile( |
/* left 8 bytes */
2979 | 414 "movq %1, %%mm0 \n\t" |
415 "movq 1%1, %%mm1 \n\t" | |
416 "movq %0, %%mm3 \n\t" | |
417 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
418 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
419 "movq %%mm0, %0 \n\t" | |
/* right 8 bytes */
420 "movq 8%1, %%mm0 \n\t" | |
421 "movq 9%1, %%mm1 \n\t" | |
422 "movq 8%0, %%mm3 \n\t" | |
423 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
424 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
425 "movq %%mm0, 8%0 \n\t" | |
426 :"+m"(*block) | |
427 :"m"(*pixels) | |
428 :"memory"); | |
429 pixels += line_size; | |
430 block += line_size; | |
651 | 431 } while (--h); |
432 } | |
433 | |
5083
ce36118abbbb
rename attribute_unused to av_unused and moves its declaration to common.h
benoit
parents:
5006
diff
changeset
|
/*
 * avg_pixels16_l2: 16-byte-wide variant of avg_pixels8_l2. For each of h
 * rows and for each 8-byte half (offsets 0 and 8), it combines src1 with
 * src2, then combines that result with the bytes currently in dst and writes
 * it back to dst.
 *
 * dst advances by dstStride and src1 by src1Stride per row; src2 is read as
 * tightly packed 16-byte rows (it advances by 16 per row).
 *
 * The loop is a do/while on --h, so one row is always processed before h is
 * tested; h == 0 is not handled.
 *
 * NOTE(review): "8%0", "8%1" and "8%2" prefix a displacement to the textual
 * form of the memory operand; this presumably relies on the compiler
 * emitting the operand without a displacement of its own -- confirm.
 *
 * MOVQ_BFE, PAVGB and JUMPALIGN are defined outside this file.
 */
434 static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
954 | 435 { |
436 MOVQ_BFE(mm6); | |
437 JUMPALIGN(); | |
438 do { | |
6392 | 439 asm volatile( |
/* left 8 bytes */
2979 | 440 "movq %1, %%mm0 \n\t" |
441 "movq %2, %%mm1 \n\t" | |
442 "movq %0, %%mm3 \n\t" | |
443 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
444 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
445 "movq %%mm0, %0 \n\t" | |
/* right 8 bytes */
446 "movq 8%1, %%mm0 \n\t" | |
447 "movq 8%2, %%mm1 \n\t" | |
448 "movq 8%0, %%mm3 \n\t" | |
449 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) | |
450 PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) | |
451 "movq %%mm0, 8%0 \n\t" | |
452 :"+m"(*dst) | |
453 :"m"(*src1), "m"(*src2) | |
454 :"memory"); | |
455 dst += dstStride; | |
954 | 456 src1 += src1Stride; |
457 src2 += 16; | |
458 } while (--h); | |
459 } | |
460 | |
/*
 * avg_pixels8_y2: for an 8-byte-wide block, combines each source row with
 * the row below it (PAVGBP), then combines that result with the bytes
 * currently in block (PAVGB) and writes it back to block.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (stride of both
 * source and destination).
 *
 * The first source row is loaded once before the loop; the last row loaded
 * in each half of the loop body is carried over as the "upper" row of the
 * next half (mm2 after the first half, mm0 after the second). In total
 * h + 1 source rows are read.
 *
 * Each loop iteration produces 4 rows and the loop exits only when h reaches
 * exactly 0, so h is expected to be a positive multiple of 4.
 *
 * MOVQ_BFE, PAVGB and PAVGBP are defined outside this file; presumably they
 * implement packed byte averaging with mm6 as a helper constant -- confirm
 * against the including file.
 */
1064 | 461 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
462 { |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
463 MOVQ_BFE(mm6); |
6392 | 464 asm volatile( |
/* REG_a = 2 * line_size; mm0 = first source row */
2979 | 465 "lea (%3, %3), %%"REG_a" \n\t" |
466 "movq (%1), %%mm0 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
467 ASMALIGN(3) |
2979 | 468 "1: \n\t" |
/* first pair of output rows: vertical step into mm4/mm5, then merge with block */
469 "movq (%1, %3), %%mm1 \n\t" | |
470 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
471 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |
472 "movq (%2), %%mm3 \n\t" | |
473 PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) | |
474 "movq (%2, %3), %%mm3 \n\t" | |
475 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) | |
476 "movq %%mm0, (%2) \n\t" | |
477 "movq %%mm1, (%2, %3) \n\t" | |
478 "add %%"REG_a", %1 \n\t" | |
479 "add %%"REG_a", %2 \n\t" | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
480 |
/* second pair of output rows: mm2 is the carried-over row, mm0 receives the newest row */
2979 | 481 "movq (%1, %3), %%mm1 \n\t" |
482 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
483 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |
484 "movq (%2), %%mm3 \n\t" | |
485 PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) | |
486 "movq (%2, %3), %%mm3 \n\t" | |
487 PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) | |
488 "movq %%mm2, (%2) \n\t" | |
489 "movq %%mm1, (%2, %3) \n\t" | |
490 "add %%"REG_a", %1 \n\t" | |
491 "add %%"REG_a", %2 \n\t" | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
492 |
2979 | 493 "subl $4, %0 \n\t" |
494 "jnz 1b \n\t" | |
495 :"+g"(h), "+S"(pixels), "+D"(block) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
496 :"r"((x86_reg)line_size) |
2979 | 497 :REG_a, "memory"); |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
498 } |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
499 |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
500 // this routine is 'slightly' suboptimal but mostly unused |
/*
 * avg_pixels8_xy2: for an 8-byte-wide block, computes for each output byte
 *     (a + b + c + d + mm6) >> 2
 * over the 2x2 source neighbourhood (x, x+1) x (row, row+1) using 16-bit
 * intermediates, then combines that value with the byte currently in block
 * (PAVGB) and writes it back to block.
 *
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (stride of both
 * source and destination). REG_a is a running byte offset applied to both
 * pixels (already advanced by one row) and block.
 *
 * mm6 holds the SET_RND constant here, so the helper constant passed as the
 * last PAVGB argument is rebuilt in mm2 before each use: pcmpeqd sets all
 * bits, and paddb of the register with itself leaves 0xFE in every byte.
 *
 * Each loop iteration produces 2 rows and the loop exits only when h reaches
 * exactly 0, so h is expected to be a positive even number. In total h + 1
 * source rows are read.
 *
 * MOVQ_ZERO, SET_RND and PAVGB are defined outside this file; presumably
 * MOVQ_ZERO(mm7) clears mm7 (it is used as the zero operand of the unpack
 * instructions) -- confirm against the including file.
 */
1064 | 501 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
502 { |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
503 MOVQ_ZERO(mm7); |
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
504 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
6392 | 505 asm volatile( |
/* mm4/mm5 = low/high word halves of row0[x] + row0[x+1] */
2979 | 506 "movq (%1), %%mm0 \n\t" |
507 "movq 1(%1), %%mm4 \n\t" | |
508 "movq %%mm0, %%mm1 \n\t" | |
509 "movq %%mm4, %%mm5 \n\t" | |
510 "punpcklbw %%mm7, %%mm0 \n\t" | |
511 "punpcklbw %%mm7, %%mm4 \n\t" | |
512 "punpckhbw %%mm7, %%mm1 \n\t" | |
513 "punpckhbw %%mm7, %%mm5 \n\t" | |
514 "paddusw %%mm0, %%mm4 \n\t" | |
515 "paddusw %%mm1, %%mm5 \n\t" | |
/* REG_a = 0 (row offset); pixels now points at the second source row */
516 "xor %%"REG_a", %%"REG_a" \n\t" | |
517 "add %3, %1 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3036
diff
changeset
|
518 ASMALIGN(3) |
2979 | 519 "1: \n\t" |
/* first row of the iteration: 2x2 sum into mm4, then merge with block */
520 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
521 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
522 "movq %%mm0, %%mm1 \n\t" | |
523 "movq %%mm2, %%mm3 \n\t" | |
524 "punpcklbw %%mm7, %%mm0 \n\t" | |
525 "punpcklbw %%mm7, %%mm2 \n\t" | |
526 "punpckhbw %%mm7, %%mm1 \n\t" | |
527 "punpckhbw %%mm7, %%mm3 \n\t" | |
528 "paddusw %%mm2, %%mm0 \n\t" | |
529 "paddusw %%mm3, %%mm1 \n\t" | |
530 "paddusw %%mm6, %%mm4 \n\t" | |
531 "paddusw %%mm6, %%mm5 \n\t" | |
532 "paddusw %%mm0, %%mm4 \n\t" | |
533 "paddusw %%mm1, %%mm5 \n\t" | |
534 "psrlw $2, %%mm4 \n\t" | |
535 "psrlw $2, %%mm5 \n\t" | |
536 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
537 "packuswb %%mm5, %%mm4 \n\t" | |
/* mm2 = 0xFE in every byte (all ones, then each byte doubled) */
538 "pcmpeqd %%mm2, %%mm2 \n\t" | |
539 "paddb %%mm2, %%mm2 \n\t" | |
540 PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) | |
541 "movq %%mm5, (%2, %%"REG_a") \n\t" | |
542 "add %3, %%"REG_a" \n\t" | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
543 |
/* second row of the iteration: same computation with the register roles swapped */
2979 | 544 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
545 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |
546 "movq %%mm2, %%mm3 \n\t" | |
547 "movq %%mm4, %%mm5 \n\t" | |
548 "punpcklbw %%mm7, %%mm2 \n\t" | |
549 "punpcklbw %%mm7, %%mm4 \n\t" | |
550 "punpckhbw %%mm7, %%mm3 \n\t" | |
551 "punpckhbw %%mm7, %%mm5 \n\t" | |
552 "paddusw %%mm2, %%mm4 \n\t" | |
553 "paddusw %%mm3, %%mm5 \n\t" | |
554 "paddusw %%mm6, %%mm0 \n\t" | |
555 "paddusw %%mm6, %%mm1 \n\t" | |
556 "paddusw %%mm4, %%mm0 \n\t" | |
557 "paddusw %%mm5, %%mm1 \n\t" | |
558 "psrlw $2, %%mm0 \n\t" | |
559 "psrlw $2, %%mm1 \n\t" | |
560 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
561 "packuswb %%mm1, %%mm0 \n\t" | |
/* mm2 = 0xFE in every byte again (it was overwritten above) */
562 "pcmpeqd %%mm2, %%mm2 \n\t" | |
563 "paddb %%mm2, %%mm2 \n\t" | |
564 PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) | |
565 "movq %%mm1, (%2, %%"REG_a") \n\t" | |
566 "add %3, %%"REG_a" \n\t" | |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
567 |
2979 | 568 "subl $2, %0 \n\t" |
569 "jnz 1b \n\t" | |
570 :"+g"(h), "+S"(pixels) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6392
diff
changeset
|
571 :"D"(block), "r"((x86_reg)line_size) |
2979 | 572 :REG_a, "memory"); |
470
b94e82d31b06
* implemented remaing avg_ pixel functions (these are not used offen)
kabi
parents:
448
diff
changeset
|
573 } |
651 | 574 |
575 //FIXME optimize | |
1064 | 576 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 577 DEF(put, pixels8_y2)(block , pixels , line_size, h); |
578 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); | |
579 } | |
580 | |
1064 | 581 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 582 DEF(put, pixels8_xy2)(block , pixels , line_size, h); |
583 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); | |
584 } | |
585 | |
1064 | 586 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 587 DEF(avg, pixels8_y2)(block , pixels , line_size, h); |
588 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); | |
589 } | |
590 | |
1064 | 591 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ |
651 | 592 DEF(avg, pixels8_xy2)(block , pixels , line_size, h); |
593 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); | |
594 } |