Mercurial > libavcodec.hg
annotate i386/dsputil_mmx.c @ 446:efe0c0d40577 libavcodec
* reenabled original xy2 put routine - rounding error is really bad with
the new code
* added PAVGP macros for parallel processing to safe few more cycles on
celerons
author | kabi |
---|---|
date | Wed, 29 May 2002 19:57:21 +0000 |
parents | 62c01dbdc1e0 |
children | e8c8ca9106aa |
rev | line source |
---|---|
0 | 1 /* |
2 * MMX optimized DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
0 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
0 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
0 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
0 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
0 | 18 * |
19 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
20 */ | |
21 | |
22 #include "../dsputil.h" | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
23 #include "../simple_idct.h" |
0 | 24 |
5 | 25 int mm_flags; /* multimedia extension flags */ |
26 | |
294 | 27 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); |
28 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
29 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
30 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
31 | |
32 int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
33 int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
34 int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
35 int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
36 | |
37 int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
38 int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
39 int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
40 int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); | |
41 | |
42 int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
43 int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
44 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
45 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); | |
46 | |
42 | 47 /* external functions, from idct_mmx.c */ |
48 void ff_mmx_idct(DCTELEM *block); | |
49 void ff_mmxext_idct(DCTELEM *block); | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
50 |
0 | 51 /* pixel operations */ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
52 static const uint64_t mm_bfe __attribute__ ((aligned(8))) = 0xfefefefefefefefeULL; |
387 | 53 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; |
54 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
55 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; | |
8 | 56 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; |
57 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; | |
0 | 58 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
59 #define JUMPALIGN() __asm __volatile (".balign 8"::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
60 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
61 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
62 #ifndef PIC |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
63 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
64 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) |
387 | 65 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
66 #define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t" |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
67 #else |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
68 // for shared library it's better to use this way for accessing constants |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
69 // pcmpeqd -> -1 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
70 #define MOVQ_WONE(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
71 __asm __volatile ( \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
72 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
73 "psrlw $15, %%" #regd ::) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
74 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
75 #define MOVQ_WTWO(regd) \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
76 __asm __volatile ( \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
77 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
78 "psrlw $15, %%" #regd " \n\t" \ |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
79 "psllw $1, %%" #regd ::) |
387 | 80 |
81 #define MOVQ_BONE(regd) \ | |
82 "pcmpeqd " #regd ", " #regd " \n\t" \ | |
83 "psrlw $15, " #regd " \n\t"\ | |
84 "packuswb " #regd ", " #regd " \n\t" | |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
85 |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
86 #define MOVQ_BFE(regd) \ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
87 "pcmpeqd " #regd ", " #regd " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
88 "paddb " #regd ", " #regd " \n\t" |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
89 #endif |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
90 |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
91 // using mm6 as temporary and for the output result |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
92 // first argument is unmodified and second is trashed |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
93 // mm7 is supposed to contain 0xfefefefefefefefe |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
94 #define PAVGB_MMX_NO_RND(rega, regb, regr) \ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
95 "movq " #rega ", " #regr " \n\t"\ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
96 "pand " #regb ", " #regr " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
97 "pxor " #rega ", " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
98 "pand %%mm7, " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
99 "psrlq $1, " #regb " \n\t"\ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
100 "paddb " #regb ", " #regr " \n\t" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
101 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
102 #define PAVGB_MMX(rega, regb, regr) \ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
103 "movq " #rega ", " #regr " \n\t"\ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
104 "por " #regb ", " #regr " \n\t"\ |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
105 "pxor " #rega ", " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
106 "pand %%mm7, " #regb " \n\t"\ |
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
107 "psrlq $1, " #regb " \n\t"\ |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
108 "psubb " #regb ", " #regr " \n\t" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
109 |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
110 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
111 "movq " #rega ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
112 "movq " #regc ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
113 "pand " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
114 "pand " #regd ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
115 "pxor " #rega ", " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
116 "pxor " #regc ", " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
117 "pand %%mm7, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
118 "pand %%mm7, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
119 "psrlq $1, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
120 "psrlq $1, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
121 "paddb " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
122 "paddb " #regd ", " #regp " \n\t" |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
123 |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
124 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
125 "movq " #rega ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
126 "movq " #regc ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
127 "por " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
128 "por " #regd ", " #regp " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
129 "pxor " #rega ", " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
130 "pxor " #regc ", " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
131 "pand %%mm7, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
132 "pand %%mm7, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
133 "psrlq $1, " #regd " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
134 "psrlq $1, " #regb " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
135 "psubb " #regb ", " #regr " \n\t"\ |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
136 "psubb " #regd ", " #regp " \n\t" |
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
137 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
138 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
139 /* MMX no rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
140 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
141 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
142 #define PAVGB(a, b) PAVGB_MMX_NO_RND(a, b, %%mm6) |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
143 #define PAVGBR(a, b, c) PAVGB_MMX_NO_RND(a, b, c) |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
144 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
145 #include "dsputil_mmx_rnd.h" |
444
a5edef76dac6
* new mmx code - based upon http://aggregate.org/MAGIC
kabi
parents:
438
diff
changeset
|
146 |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
147 #undef DEF |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
148 #undef PAVGB |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
149 #undef PAVGBR |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
150 #undef PAVGBP |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
151 /***********************************/ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
152 /* MMX rounding */ |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
153 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
154 #define DEF(x, y) x ## _ ## y ##_mmx |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
155 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
156 #define PAVGB(a, b) PAVGB_MMX(a, b, %%mm6) |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
157 #define PAVGBR(a, b, c) PAVGB_MMX(a, b, c) |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
158 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
159 #include "dsputil_mmx_rnd.h" |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
160 |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
161 #undef DEF |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
162 #undef PAVGB |
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
163 #undef PAVGBR |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
164 #undef PAVGBP |
387 | 165 |
0 | 166 /***********************************/ |
167 /* 3Dnow specific */ | |
168 | |
169 #define DEF(x) x ## _3dnow | |
170 /* for Athlons PAVGUSB is preferred */ | |
171 #define PAVGB "pavgusb" | |
172 | |
173 #include "dsputil_mmx_avg.h" | |
174 | |
175 #undef DEF | |
176 #undef PAVGB | |
177 | |
178 /***********************************/ | |
179 /* MMX2 specific */ | |
180 | |
386 | 181 #define DEF(x) x ## _mmx2 |
0 | 182 |
183 /* Introduced only in MMX2 set */ | |
184 #define PAVGB "pavgb" | |
185 | |
186 #include "dsputil_mmx_avg.h" | |
187 | |
188 #undef DEF | |
189 #undef PAVGB | |
190 | |
191 /***********************************/ | |
192 /* standard MMX */ | |
193 | |
/*
 * get_pixels_mmx: widen an 8x8 area of unsigned 8-bit pixels to 16-bit
 * words and store the result into block.
 *
 *   block     - destination; 128 bytes are written in total
 *   pixels    - source pixels; 8 bytes are read from each row
 *   line_size - distance in bytes between consecutive source rows
 *
 * mm7 is cleared and used as the zero operand of punpcklbw/punpckhbw, so
 * every byte is zero-extended to a word.  Each loop pass converts two rows
 * (pixels and pixels + line_size) and then advances pixels by 2*line_size.
 * eax is a byte offset relative to block+64; it runs from -128 up to 0 in
 * steps of 32, so the loop body executes four times.
 *
 * NOTE(review): the -128 byte start offset lines up with block[0] only if
 * DCTELEM is 2 bytes wide - confirm against dsputil.h.
 * NOTE(review): the asm stores through block but lists neither a memory
 * output operand nor a "memory" clobber - confirm this is intended.
 */
194 static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) | |
195 { | |
386 | 196 asm volatile( |
197 "movl $-128, %%eax \n\t" | |
198 "pxor %%mm7, %%mm7 \n\t" | |
199 ".balign 16 \n\t" | |
200 "1: \n\t" | |
201 "movq (%0), %%mm0 \n\t" | |
202 "movq (%0, %2), %%mm2 \n\t" | |
203 "movq %%mm0, %%mm1 \n\t" | |
204 "movq %%mm2, %%mm3 \n\t" | |
205 "punpcklbw %%mm7, %%mm0 \n\t" | |
206 "punpckhbw %%mm7, %%mm1 \n\t" | |
207 "punpcklbw %%mm7, %%mm2 \n\t" | |
208 "punpckhbw %%mm7, %%mm3 \n\t" | |
209 "movq %%mm0, (%1, %%eax)\n\t" | |
210 "movq %%mm1, 8(%1, %%eax)\n\t" | |
211 "movq %%mm2, 16(%1, %%eax)\n\t" | |
212 "movq %%mm3, 24(%1, %%eax)\n\t" | |
213 "addl %3, %0 \n\t" | |
214 "addl $32, %%eax \n\t" | |
215 "js 1b \n\t" | |
216 : "+r" (pixels) | |
217 : "r" (block+64), "r" (line_size), "r" (line_size*2) | |
218 : "%eax" | |
219 ); | |
0 | 220 } |
221 | |
/*
 * diff_pixels_mmx: store the per-pixel difference s1 - s2 of two 8x8
 * pixel areas into block as 16-bit words.
 *
 *   block  - destination; 128 bytes are written in total
 *   s1, s2 - the two pixel sources; 8 bytes are read from each per row
 *   stride - distance in bytes between consecutive rows, used for both
 *            s1 and s2
 *
 * Bytes are zero-extended to words by unpacking against the cleared mm7,
 * then subtracted with psubw.  One row is handled per loop pass.  eax is a
 * byte offset relative to block+64; it runs from -128 up to 0 in steps of
 * 16, so the loop body executes eight times.
 *
 * NOTE(review): the -128 byte start offset lines up with block[0] only if
 * DCTELEM is 2 bytes wide - confirm against dsputil.h.
 * NOTE(review): the asm stores through block but lists neither a memory
 * output operand nor a "memory" clobber - confirm this is intended.
 */
324 | 222 static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) |
223 { | |
224 asm volatile( | |
386 | 225 "pxor %%mm7, %%mm7 \n\t" |
226 "movl $-128, %%eax \n\t" | |
324 | 227 ".balign 16 \n\t" |
228 "1: \n\t" | |
229 "movq (%0), %%mm0 \n\t" | |
230 "movq (%1), %%mm2 \n\t" | |
231 "movq %%mm0, %%mm1 \n\t" | |
232 "movq %%mm2, %%mm3 \n\t" | |
233 "punpcklbw %%mm7, %%mm0 \n\t" | |
234 "punpckhbw %%mm7, %%mm1 \n\t" | |
235 "punpcklbw %%mm7, %%mm2 \n\t" | |
236 "punpckhbw %%mm7, %%mm3 \n\t" | |
237 "psubw %%mm2, %%mm0 \n\t" | |
238 "psubw %%mm3, %%mm1 \n\t" | |
239 "movq %%mm0, (%2, %%eax)\n\t" | |
240 "movq %%mm1, 8(%2, %%eax)\n\t" | |
241 "addl %3, %0 \n\t" | |
242 "addl %3, %1 \n\t" | |
243 "addl $16, %%eax \n\t" | |
244 "jnz 1b \n\t" | |
245 : "+r" (s1), "+r" (s2) | |
246 : "r" (block+64), "r" (stride) | |
247 : "%eax" | |
248 ); | |
249 } | |
250 | |
/*
 * put_pixels_clamped_mmx: convert the 16-bit values in block to unsigned
 * bytes and write them out as 8 rows of 8 pixels.
 *
 *   block     - source values; 128 bytes are read in total
 *   pixels    - destination; 8 bytes are written per row
 *   line_size - distance in bytes between consecutive destination rows
 *
 * The conversion is done by packuswb, which packs signed words into
 * unsigned bytes with saturation.  The work is split into two unrolled asm
 * statements.  Each one loads 64 bytes (offsets 0..56) and stores four
 * rows at pix, pix + line_size, pix + 2*line_size and pix + 3*line_size.
 * Between the two statements pix advances by four rows and p by 32
 * elements.
 *
 * The first statement addresses its input through an "m" operand and
 * pastes the displacement in front of the operand text ("8%3", "16%3",
 * ...).  The second passes the pointer in a register instead, for the
 * reason given in the comment that precedes it.
 *
 * NOTE(review): p += 32 matches the 64 bytes consumed by the first asm
 * statement only if DCTELEM is 2 bytes wide - confirm against dsputil.h.
 */
0 | 251 static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) |
252 { | |
253 const DCTELEM *p; | |
254 UINT8 *pix; | |
255 | |
256 /* read the pixels */ | |
257 p = block; | |
258 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
259 /* unrolled loop */ |
0 | 260 __asm __volatile( |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
261 "movq %3, %%mm0\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
262 "movq 8%3, %%mm1\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
263 "movq 16%3, %%mm2\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
264 "movq 24%3, %%mm3\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
265 "movq 32%3, %%mm4\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
266 "movq 40%3, %%mm5\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
267 "movq 48%3, %%mm6\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
268 "movq 56%3, %%mm7\n\t" |
0 | 269 "packuswb %%mm1, %%mm0\n\t" |
270 "packuswb %%mm3, %%mm2\n\t" | |
271 "packuswb %%mm5, %%mm4\n\t" | |
272 "packuswb %%mm7, %%mm6\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
273 "movq %%mm0, (%0)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
274 "movq %%mm2, (%0, %1)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
275 "movq %%mm4, (%0, %1, 2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
276 "movq %%mm6, (%0, %2)\n\t" |
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
277 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) |
0 | 278 :"memory"); |
279 pix += line_size*4; | |
280 p += 32; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
281 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
282 // if here would be an exact copy of the code above |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
283 // compiler would generate some very strange code |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
284 // thus using "r" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
285 __asm __volatile( |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
286 "movq (%3), %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
287 "movq 8(%3), %%mm1\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
288 "movq 16(%3), %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
289 "movq 24(%3), %%mm3\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
290 "movq 32(%3), %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
291 "movq 40(%3), %%mm5\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
292 "movq 48(%3), %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
293 "movq 56(%3), %%mm7\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
294 "packuswb %%mm1, %%mm0\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
295 "packuswb %%mm3, %%mm2\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
296 "packuswb %%mm5, %%mm4\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
297 "packuswb %%mm7, %%mm6\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
298 "movq %%mm0, (%0)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
299 "movq %%mm2, (%0, %1)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
300 "movq %%mm4, (%0, %1, 2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
301 "movq %%mm6, (%0, %2)\n\t" |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
302 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
303 :"memory"); |
0 | 304 } |
305 | |
/*
 * add_pixels_clamped_mmx: add the 16-bit values in block to an existing
 * 8x8 pixel area and write the sums back as unsigned bytes.
 *
 *   block     - values to add; 32 bytes are read per loop pass
 *   pixels    - pixel area that is both read and overwritten, 8 bytes
 *               per row
 *   line_size - distance in bytes between consecutive pixel rows
 *
 * mm7 is cleared once by MOVQ_ZERO before the loop and is used to
 * zero-extend pixel bytes to words.  Each of the four loop passes handles
 * two rows (pix and pix + line_size): the widened pixels are added to the
 * block values with paddsw (signed saturating add), packed back to bytes
 * with packuswb (unsigned saturation) and stored to the same two rows.
 * After each pass pix advances by two rows and p by 16 elements.
 *
 * NOTE(review): mm7 is set in a separate asm statement, so the loop relies
 * on the register not being touched between asm statements.
 * NOTE(review): p += 16 matches the 32 bytes read per pass only if DCTELEM
 * is 2 bytes wide - confirm against dsputil.h.
 */
306 static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) | |
307 { | |
308 const DCTELEM *p; | |
309 UINT8 *pix; | |
310 int i; | |
311 | |
312 /* read the pixels */ | |
313 p = block; | |
314 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
315 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
316 i = 4; |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
317 do { |
0 | 318 __asm __volatile( |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
319 "movq (%2), %%mm0\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
320 "movq 8(%2), %%mm1\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
321 "movq 16(%2), %%mm2\n\t" |
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
322 "movq 24(%2), %%mm3\n\t" |
0 | 323 "movq %0, %%mm4\n\t" |
324 "movq %1, %%mm6\n\t" | |
325 "movq %%mm4, %%mm5\n\t" | |
326 "punpcklbw %%mm7, %%mm4\n\t" | |
327 "punpckhbw %%mm7, %%mm5\n\t" | |
328 "paddsw %%mm4, %%mm0\n\t" | |
329 "paddsw %%mm5, %%mm1\n\t" | |
330 "movq %%mm6, %%mm5\n\t" | |
331 "punpcklbw %%mm7, %%mm6\n\t" | |
332 "punpckhbw %%mm7, %%mm5\n\t" | |
333 "paddsw %%mm6, %%mm2\n\t" | |
334 "paddsw %%mm5, %%mm3\n\t" | |
335 "packuswb %%mm1, %%mm0\n\t" | |
336 "packuswb %%mm3, %%mm2\n\t" | |
337 "movq %%mm0, %0\n\t" | |
338 "movq %%mm2, %1\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
339 :"+m"(*pix), "+m"(*(pix+line_size)) |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
340 :"r"(p) |
0 | 341 :"memory"); |
342 pix += line_size*2; | |
343 p += 16; | |
342
8635a7036395
* fixes problem with -funroll-loops and buggy gcc compiler
kabi
parents:
324
diff
changeset
|
344 } while (--i); |
0 | 345 } |
346 | |
/*
 * put_pixels_mmx: copy h rows of 8 bytes from pixels to block.
 *
 *   block     - destination; 8 bytes are written per row
 *   pixels    - source; 8 bytes are read per row
 *   line_size - distance in bytes between consecutive rows, used for both
 *               source and destination
 *   h         - number of rows to copy
 *
 * eax holds 2*line_size.  Each loop pass copies four rows (two pairs, with
 * both pointers advanced by eax after each pair) and subtracts 4 from h.
 * The loop exits only when h reaches exactly zero, so h is expected to be
 * a positive multiple of 4.
 */
347 static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
348 { | |
420 | 349 asm volatile |
350 ( | |
351 "lea (%3, %3), %%eax \n\t" | |
422 | 352 ".balign 8 \n\t" |
420 | 353 "1: \n\t" |
354 "movq (%1), %%mm0 \n\t" | |
355 "movq (%1, %3), %%mm1 \n\t" | |
356 "movq %%mm0, (%2) \n\t" | |
357 "movq %%mm1, (%2, %3) \n\t" | |
358 "addl %%eax, %1 \n\t" | |
359 "addl %%eax, %2 \n\t" | |
360 "movq (%1), %%mm0 \n\t" | |
361 "movq (%1, %3), %%mm1 \n\t" | |
362 "movq %%mm0, (%2) \n\t" | |
363 "movq %%mm1, (%2, %3) \n\t" | |
364 "addl %%eax, %1 \n\t" | |
365 "addl %%eax, %2 \n\t" | |
366 "subl $4, %0 \n\t" | |
367 "jnz 1b \n\t" | |
368 : "+g"(h), "+r" (pixels), "+r" (block) | |
369 : "r"(line_size) | |
370 : "%eax", "memory" | |
371 ); | |
0 | 372 } |
373 | |
446
efe0c0d40577
* reenabled original xy2 put routine - rounding error is really bad with
kabi
parents:
445
diff
changeset
|
374 #if 1 |
/*
 * put_pixels_xy2_mmx: write h rows of 8 pixels, each output byte being the
 * rounded average of a 2x2 source neighbourhood:
 *
 *   out[x] = (pix[x] + pix[x+1] + pix[x+line_size] + pix[x+line_size+1] + 2) >> 2
 *
 *   block     - destination; 8 bytes are written per row
 *   pixels    - source; bytes 0..8 of each row are read, and one row more
 *               than is written (h + 1 rows)
 *   line_size - distance in bytes between consecutive rows, used for both
 *               source and destination
 *   h         - number of rows to produce; the do/while loop decrements
 *               before testing, so h must be at least 1 on entry
 *
 * Before the loop, MOVQ_ZERO clears mm7 (used to widen bytes to words) and
 * MOVQ_WTWO loads 2 into every word of mm6 (the rounding term).  Both are
 * separate asm statements, so the loop relies on mm6/mm7 not being touched
 * in between.
 * "1%1" and "1%2" read from the operand address plus one byte by pasting
 * the displacement in front of the memory operand text.
 */
0 | 375 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
376 { | |
377 UINT8 *p; | |
378 const UINT8 *pix; | |
379 p = block; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
380 pix = pixels; // 1s |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
381 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
382 MOVQ_WTWO(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
383 JUMPALIGN(); |
0 | 384 do { |
385 __asm __volatile( | |
386 "movq %1, %%mm0\n\t" | |
387 "movq %2, %%mm1\n\t" | |
388 "movq 1%1, %%mm4\n\t" | |
389 "movq 1%2, %%mm5\n\t" | |
390 "movq %%mm0, %%mm2\n\t" | |
391 "movq %%mm1, %%mm3\n\t" | |
392 "punpcklbw %%mm7, %%mm0\n\t" | |
393 "punpcklbw %%mm7, %%mm1\n\t" | |
394 "punpckhbw %%mm7, %%mm2\n\t" | |
395 "punpckhbw %%mm7, %%mm3\n\t" | |
396 "paddusw %%mm1, %%mm0\n\t" | |
397 "paddusw %%mm3, %%mm2\n\t" | |
398 "movq %%mm4, %%mm1\n\t" | |
399 "movq %%mm5, %%mm3\n\t" | |
400 "punpcklbw %%mm7, %%mm4\n\t" | |
401 "punpcklbw %%mm7, %%mm5\n\t" | |
402 "punpckhbw %%mm7, %%mm1\n\t" | |
403 "punpckhbw %%mm7, %%mm3\n\t" | |
404 "paddusw %%mm5, %%mm4\n\t" | |
405 "paddusw %%mm3, %%mm1\n\t" | |
406 "paddusw %%mm6, %%mm4\n\t" | |
407 "paddusw %%mm6, %%mm1\n\t" | |
408 "paddusw %%mm4, %%mm0\n\t" | |
409 "paddusw %%mm1, %%mm2\n\t" | |
410 "psrlw $2, %%mm0\n\t" | |
411 "psrlw $2, %%mm2\n\t" | |
412 "packuswb %%mm2, %%mm0\n\t" | |
413 "movq %%mm0, %0\n\t" | |
414 :"=m"(*p) | |
415 :"m"(*pix), | |
416 "m"(*(pix+line_size)) | |
417 :"memory"); | |
418 pix += line_size; | |
419 p += line_size; | |
420 } while(--h); | |
421 } | |
422 | |
423 static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
424 { | |
425 UINT8 *p; | |
426 const UINT8 *pix; | |
427 p = block; | |
428 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
429 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
430 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
431 JUMPALIGN(); |
0 | 432 do { |
433 __asm __volatile( | |
434 "movq %1, %%mm0\n\t" | |
435 "movq %2, %%mm1\n\t" | |
436 "movq 1%1, %%mm4\n\t" | |
437 "movq 1%2, %%mm5\n\t" | |
438 "movq %%mm0, %%mm2\n\t" | |
439 "movq %%mm1, %%mm3\n\t" | |
440 "punpcklbw %%mm7, %%mm0\n\t" | |
441 "punpcklbw %%mm7, %%mm1\n\t" | |
442 "punpckhbw %%mm7, %%mm2\n\t" | |
443 "punpckhbw %%mm7, %%mm3\n\t" | |
444 "paddusw %%mm1, %%mm0\n\t" | |
445 "paddusw %%mm3, %%mm2\n\t" | |
446 "movq %%mm4, %%mm1\n\t" | |
447 "movq %%mm5, %%mm3\n\t" | |
448 "punpcklbw %%mm7, %%mm4\n\t" | |
449 "punpcklbw %%mm7, %%mm5\n\t" | |
450 "punpckhbw %%mm7, %%mm1\n\t" | |
451 "punpckhbw %%mm7, %%mm3\n\t" | |
452 "paddusw %%mm5, %%mm4\n\t" | |
453 "paddusw %%mm3, %%mm1\n\t" | |
454 "paddusw %%mm6, %%mm4\n\t" | |
455 "paddusw %%mm6, %%mm1\n\t" | |
456 "paddusw %%mm4, %%mm0\n\t" | |
457 "paddusw %%mm1, %%mm2\n\t" | |
458 "psrlw $2, %%mm0\n\t" | |
459 "psrlw $2, %%mm2\n\t" | |
460 "packuswb %%mm2, %%mm0\n\t" | |
461 "movq %%mm0, %0\n\t" | |
462 :"=m"(*p) | |
463 :"m"(*pix), | |
464 "m"(*(pix+line_size)) | |
465 :"memory"); | |
466 pix += line_size; | |
467 p += line_size; | |
468 } while(--h); | |
469 } | |
445
62c01dbdc1e0
* code with new PAVGB for MMX only CPU splited into separate file
kabi
parents:
444
diff
changeset
|
470 #endif |
0 | 471 static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
472 { | |
473 UINT8 *p; | |
474 const UINT8 *pix; | |
475 p = block; | |
476 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
477 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
478 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
479 JUMPALIGN(); |
0 | 480 do { |
481 __asm __volatile( | |
482 "movq %0, %%mm0\n\t" | |
483 "movq %1, %%mm1\n\t" | |
484 "movq %%mm0, %%mm2\n\t" | |
485 "movq %%mm1, %%mm3\n\t" | |
486 "punpcklbw %%mm7, %%mm0\n\t" | |
487 "punpcklbw %%mm7, %%mm1\n\t" | |
488 "punpckhbw %%mm7, %%mm2\n\t" | |
489 "punpckhbw %%mm7, %%mm3\n\t" | |
490 "paddusw %%mm1, %%mm0\n\t" | |
491 "paddusw %%mm3, %%mm2\n\t" | |
492 "paddusw %%mm6, %%mm0\n\t" | |
493 "paddusw %%mm6, %%mm2\n\t" | |
494 "psrlw $1, %%mm0\n\t" | |
495 "psrlw $1, %%mm2\n\t" | |
496 "packuswb %%mm2, %%mm0\n\t" | |
497 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
498 :"+m"(*p) |
0 | 499 :"m"(*pix) |
500 :"memory"); | |
501 pix += line_size; | |
502 p += line_size; | |
503 } | |
504 while (--h); | |
505 } | |
506 | |
507 static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
508 { | |
509 UINT8 *p; | |
510 const UINT8 *pix; | |
511 p = block; | |
512 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
513 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
514 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
515 JUMPALIGN(); |
0 | 516 do { |
517 __asm __volatile( | |
518 "movq %1, %%mm1\n\t" | |
519 "movq %0, %%mm0\n\t" | |
520 "movq 1%1, %%mm4\n\t" | |
521 "movq %%mm0, %%mm2\n\t" | |
522 "movq %%mm1, %%mm3\n\t" | |
523 "movq %%mm4, %%mm5\n\t" | |
524 "punpcklbw %%mm7, %%mm1\n\t" | |
525 "punpckhbw %%mm7, %%mm3\n\t" | |
526 "punpcklbw %%mm7, %%mm4\n\t" | |
527 "punpckhbw %%mm7, %%mm5\n\t" | |
528 "punpcklbw %%mm7, %%mm0\n\t" | |
529 "punpckhbw %%mm7, %%mm2\n\t" | |
530 "paddusw %%mm4, %%mm1\n\t" | |
531 "paddusw %%mm5, %%mm3\n\t" | |
532 "paddusw %%mm6, %%mm1\n\t" | |
533 "paddusw %%mm6, %%mm3\n\t" | |
534 "psrlw $1, %%mm1\n\t" | |
535 "psrlw $1, %%mm3\n\t" | |
536 "paddusw %%mm6, %%mm0\n\t" | |
537 "paddusw %%mm6, %%mm2\n\t" | |
538 "paddusw %%mm1, %%mm0\n\t" | |
539 "paddusw %%mm3, %%mm2\n\t" | |
540 "psrlw $1, %%mm0\n\t" | |
541 "psrlw $1, %%mm2\n\t" | |
542 "packuswb %%mm2, %%mm0\n\t" | |
543 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
544 :"+m"(*p) |
0 | 545 :"m"(*pix) |
546 :"memory"); | |
547 pix += line_size; | |
548 p += line_size; | |
549 } while (--h); | |
550 } | |
551 | |
552 static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
553 { | |
554 UINT8 *p; | |
555 const UINT8 *pix; | |
556 p = block; | |
557 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
558 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
559 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
560 JUMPALIGN(); |
0 | 561 do { |
562 __asm __volatile( | |
563 "movq %1, %%mm1\n\t" | |
564 "movq %0, %%mm0\n\t" | |
565 "movq %2, %%mm4\n\t" | |
566 "movq %%mm0, %%mm2\n\t" | |
567 "movq %%mm1, %%mm3\n\t" | |
568 "movq %%mm4, %%mm5\n\t" | |
569 "punpcklbw %%mm7, %%mm1\n\t" | |
570 "punpckhbw %%mm7, %%mm3\n\t" | |
571 "punpcklbw %%mm7, %%mm4\n\t" | |
572 "punpckhbw %%mm7, %%mm5\n\t" | |
573 "punpcklbw %%mm7, %%mm0\n\t" | |
574 "punpckhbw %%mm7, %%mm2\n\t" | |
575 "paddusw %%mm4, %%mm1\n\t" | |
576 "paddusw %%mm5, %%mm3\n\t" | |
577 "paddusw %%mm6, %%mm1\n\t" | |
578 "paddusw %%mm6, %%mm3\n\t" | |
579 "psrlw $1, %%mm1\n\t" | |
580 "psrlw $1, %%mm3\n\t" | |
581 "paddusw %%mm6, %%mm0\n\t" | |
582 "paddusw %%mm6, %%mm2\n\t" | |
583 "paddusw %%mm1, %%mm0\n\t" | |
584 "paddusw %%mm3, %%mm2\n\t" | |
585 "psrlw $1, %%mm0\n\t" | |
586 "psrlw $1, %%mm2\n\t" | |
587 "packuswb %%mm2, %%mm0\n\t" | |
588 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
589 :"+m"(*p) |
0 | 590 :"m"(*pix), "m"(*(pix+line_size)) |
591 :"memory"); | |
592 pix += line_size; | |
593 p += line_size ; | |
594 } while(--h); | |
595 } | |
596 | |
597 static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
598 { | |
599 UINT8 *p; | |
600 const UINT8 *pix; | |
601 p = block; | |
602 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
603 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
604 // this doesn't seem to be used offten - so |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
605 // the inside usage of mm_wone is not optimized |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
606 MOVQ_WTWO(mm6); |
0 | 607 do { |
608 __asm __volatile( | |
609 "movq %1, %%mm0\n\t" | |
610 "movq %2, %%mm1\n\t" | |
611 "movq 1%1, %%mm4\n\t" | |
612 "movq 1%2, %%mm5\n\t" | |
613 "movq %%mm0, %%mm2\n\t" | |
614 "movq %%mm1, %%mm3\n\t" | |
615 "punpcklbw %%mm7, %%mm0\n\t" | |
616 "punpcklbw %%mm7, %%mm1\n\t" | |
617 "punpckhbw %%mm7, %%mm2\n\t" | |
618 "punpckhbw %%mm7, %%mm3\n\t" | |
619 "paddusw %%mm1, %%mm0\n\t" | |
620 "paddusw %%mm3, %%mm2\n\t" | |
621 "movq %%mm4, %%mm1\n\t" | |
622 "movq %%mm5, %%mm3\n\t" | |
623 "punpcklbw %%mm7, %%mm4\n\t" | |
624 "punpcklbw %%mm7, %%mm5\n\t" | |
625 "punpckhbw %%mm7, %%mm1\n\t" | |
626 "punpckhbw %%mm7, %%mm3\n\t" | |
627 "paddusw %%mm5, %%mm4\n\t" | |
628 "paddusw %%mm3, %%mm1\n\t" | |
629 "paddusw %%mm6, %%mm4\n\t" | |
630 "paddusw %%mm6, %%mm1\n\t" | |
631 "paddusw %%mm4, %%mm0\n\t" | |
632 "paddusw %%mm1, %%mm2\n\t" | |
633 "movq %3, %%mm5\n\t" | |
634 "psrlw $2, %%mm0\n\t" | |
635 "movq %0, %%mm1\n\t" | |
636 "psrlw $2, %%mm2\n\t" | |
637 "movq %%mm1, %%mm3\n\t" | |
638 "punpcklbw %%mm7, %%mm1\n\t" | |
639 "punpckhbw %%mm7, %%mm3\n\t" | |
640 "paddusw %%mm1, %%mm0\n\t" | |
641 "paddusw %%mm3, %%mm2\n\t" | |
642 "paddusw %%mm5, %%mm0\n\t" | |
643 "paddusw %%mm5, %%mm2\n\t" | |
644 "psrlw $1, %%mm0\n\t" | |
645 "psrlw $1, %%mm2\n\t" | |
646 "packuswb %%mm2, %%mm0\n\t" | |
647 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
648 :"+m"(*p) |
0 | 649 :"m"(*pix), |
8 | 650 "m"(*(pix+line_size)), "m"(mm_wone) |
0 | 651 :"memory"); |
652 pix += line_size; | |
653 p += line_size ; | |
654 } while(--h); | |
655 } | |
656 | |
657 static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
658 { | |
659 UINT8 *p; | |
660 const UINT8 *pix; | |
661 p = block; | |
662 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
663 MOVQ_ZERO(mm7); |
0 | 664 do { |
665 __asm __volatile( | |
666 "movq %1, %%mm0\n\t" | |
667 "movq %0, %%mm1\n\t" | |
668 "movq %%mm0, %%mm2\n\t" | |
669 "movq %%mm1, %%mm3\n\t" | |
670 "punpcklbw %%mm7, %%mm0\n\t" | |
671 "punpcklbw %%mm7, %%mm1\n\t" | |
672 "punpckhbw %%mm7, %%mm2\n\t" | |
673 "punpckhbw %%mm7, %%mm3\n\t" | |
674 "paddusw %%mm1, %%mm0\n\t" | |
675 "paddusw %%mm3, %%mm2\n\t" | |
676 "psrlw $1, %%mm0\n\t" | |
677 "psrlw $1, %%mm2\n\t" | |
678 "packuswb %%mm2, %%mm0\n\t" | |
679 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
680 :"+m"(*p) |
0 | 681 :"m"(*pix) |
682 :"memory"); | |
683 pix += line_size; | |
684 p += line_size ; | |
685 } while (--h); | |
686 } | |
687 | |
688 static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
689 { | |
690 UINT8 *p; | |
691 const UINT8 *pix; | |
692 p = block; | |
693 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
694 MOVQ_ZERO(mm7); |
0 | 695 do { |
696 __asm __volatile( | |
697 "movq %1, %%mm0\n\t" | |
698 "movq 1%1, %%mm1\n\t" | |
699 "movq %0, %%mm4\n\t" | |
700 "movq %%mm0, %%mm2\n\t" | |
701 "movq %%mm1, %%mm3\n\t" | |
702 "movq %%mm4, %%mm5\n\t" | |
703 "punpcklbw %%mm7, %%mm0\n\t" | |
704 "punpcklbw %%mm7, %%mm1\n\t" | |
705 "punpckhbw %%mm7, %%mm2\n\t" | |
706 "punpckhbw %%mm7, %%mm3\n\t" | |
707 "punpcklbw %%mm7, %%mm4\n\t" | |
708 "punpckhbw %%mm7, %%mm5\n\t" | |
709 "paddusw %%mm1, %%mm0\n\t" | |
710 "paddusw %%mm3, %%mm2\n\t" | |
711 "psrlw $1, %%mm0\n\t" | |
712 "psrlw $1, %%mm2\n\t" | |
713 "paddusw %%mm4, %%mm0\n\t" | |
714 "paddusw %%mm5, %%mm2\n\t" | |
715 "psrlw $1, %%mm0\n\t" | |
716 "psrlw $1, %%mm2\n\t" | |
717 "packuswb %%mm2, %%mm0\n\t" | |
718 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
719 :"+m"(*p) |
0 | 720 :"m"(*pix) |
721 :"memory"); | |
722 pix += line_size; | |
723 p += line_size; | |
724 } while (--h); | |
725 } | |
726 | |
727 static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
728 { | |
729 UINT8 *p; | |
730 const UINT8 *pix; | |
731 p = block; | |
732 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
733 MOVQ_ZERO(mm7); |
0 | 734 do { |
735 __asm __volatile( | |
736 "movq %1, %%mm0\n\t" | |
737 "movq %2, %%mm1\n\t" | |
738 "movq %0, %%mm4\n\t" | |
739 "movq %%mm0, %%mm2\n\t" | |
740 "movq %%mm1, %%mm3\n\t" | |
741 "movq %%mm4, %%mm5\n\t" | |
742 "punpcklbw %%mm7, %%mm0\n\t" | |
743 "punpcklbw %%mm7, %%mm1\n\t" | |
744 "punpckhbw %%mm7, %%mm2\n\t" | |
745 "punpckhbw %%mm7, %%mm3\n\t" | |
746 "punpcklbw %%mm7, %%mm4\n\t" | |
747 "punpckhbw %%mm7, %%mm5\n\t" | |
748 "paddusw %%mm1, %%mm0\n\t" | |
749 "paddusw %%mm3, %%mm2\n\t" | |
750 "psrlw $1, %%mm0\n\t" | |
751 "psrlw $1, %%mm2\n\t" | |
752 "paddusw %%mm4, %%mm0\n\t" | |
753 "paddusw %%mm5, %%mm2\n\t" | |
754 "psrlw $1, %%mm0\n\t" | |
755 "psrlw $1, %%mm2\n\t" | |
756 "packuswb %%mm2, %%mm0\n\t" | |
757 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
758 :"+m"(*p) |
0 | 759 :"m"(*pix), "m"(*(pix+line_size)) |
760 :"memory"); | |
761 pix += line_size; | |
762 p += line_size ; | |
763 } while(--h); | |
764 } | |
765 | |
766 static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
767 { | |
768 UINT8 *p; | |
769 const UINT8 *pix; | |
770 p = block; | |
771 pix = pixels; | |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
772 MOVQ_ZERO(mm7); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
773 MOVQ_WONE(mm6); |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
774 JUMPALIGN(); |
0 | 775 do { |
776 __asm __volatile( | |
777 "movq %1, %%mm0\n\t" | |
778 "movq %2, %%mm1\n\t" | |
779 "movq 1%1, %%mm4\n\t" | |
780 "movq 1%2, %%mm5\n\t" | |
781 "movq %%mm0, %%mm2\n\t" | |
782 "movq %%mm1, %%mm3\n\t" | |
783 "punpcklbw %%mm7, %%mm0\n\t" | |
784 "punpcklbw %%mm7, %%mm1\n\t" | |
785 "punpckhbw %%mm7, %%mm2\n\t" | |
786 "punpckhbw %%mm7, %%mm3\n\t" | |
787 "paddusw %%mm1, %%mm0\n\t" | |
788 "paddusw %%mm3, %%mm2\n\t" | |
789 "movq %%mm4, %%mm1\n\t" | |
790 "movq %%mm5, %%mm3\n\t" | |
791 "punpcklbw %%mm7, %%mm4\n\t" | |
792 "punpcklbw %%mm7, %%mm5\n\t" | |
793 "punpckhbw %%mm7, %%mm1\n\t" | |
794 "punpckhbw %%mm7, %%mm3\n\t" | |
795 "paddusw %%mm5, %%mm4\n\t" | |
796 "paddusw %%mm3, %%mm1\n\t" | |
797 "paddusw %%mm6, %%mm4\n\t" | |
798 "paddusw %%mm6, %%mm1\n\t" | |
799 "paddusw %%mm4, %%mm0\n\t" | |
800 "paddusw %%mm1, %%mm2\n\t" | |
801 "movq %0, %%mm1\n\t" | |
802 "psrlw $2, %%mm0\n\t" | |
803 "movq %%mm1, %%mm3\n\t" | |
804 "psrlw $2, %%mm2\n\t" | |
805 "punpcklbw %%mm7, %%mm1\n\t" | |
806 "punpckhbw %%mm7, %%mm3\n\t" | |
807 "paddusw %%mm1, %%mm0\n\t" | |
808 "paddusw %%mm3, %%mm2\n\t" | |
809 "psrlw $1, %%mm0\n\t" | |
810 "psrlw $1, %%mm2\n\t" | |
811 "packuswb %%mm2, %%mm0\n\t" | |
812 "movq %%mm0, %0\n\t" | |
151
ae0516eadae2
fixed gcc-3.0.x compilation (by Michael Niedermayer)
nickols_k
parents:
42
diff
changeset
|
813 :"+m"(*p) |
0 | 814 :"m"(*pix), |
815 "m"(*(pix+line_size)) | |
816 :"memory"); | |
817 pix += line_size; | |
818 p += line_size; | |
819 } while(--h); | |
820 } | |
821 | |
296 | 822 static void clear_blocks_mmx(DCTELEM *blocks) |
823 { | |
824 asm volatile( | |
825 "pxor %%mm7, %%mm7 \n\t" | |
826 "movl $-128*6, %%eax \n\t" | |
827 "1: \n\t" | |
828 "movq %%mm7, (%0, %%eax) \n\t" | |
829 "movq %%mm7, 8(%0, %%eax) \n\t" | |
830 "movq %%mm7, 16(%0, %%eax) \n\t" | |
831 "movq %%mm7, 24(%0, %%eax) \n\t" | |
832 "addl $32, %%eax \n\t" | |
833 " js 1b \n\t" | |
834 : : "r" (((int)blocks)+128*6) | |
835 : "%eax" | |
836 ); | |
837 } | |
838 | |
393 | 839 #if 0 |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
/* Do-nothing stub; the disabled "for speed testing" block in
 * dsputil_init_mmx installs it in place of the real routines. */
static void just_return()
{
}
393 | 841 #endif |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
842 |
0 | 843 void dsputil_init_mmx(void) |
844 { | |
845 mm_flags = mm_support(); | |
188 | 846 #if 1 |
847 printf("libavcodec: CPU flags:"); | |
0 | 848 if (mm_flags & MM_MMX) |
849 printf(" mmx"); | |
850 if (mm_flags & MM_MMXEXT) | |
851 printf(" mmxext"); | |
852 if (mm_flags & MM_3DNOW) | |
853 printf(" 3dnow"); | |
854 if (mm_flags & MM_SSE) | |
855 printf(" sse"); | |
856 if (mm_flags & MM_SSE2) | |
857 printf(" sse2"); | |
858 printf("\n"); | |
859 #endif | |
860 | |
861 if (mm_flags & MM_MMX) { | |
862 get_pixels = get_pixels_mmx; | |
324 | 863 diff_pixels = diff_pixels_mmx; |
0 | 864 put_pixels_clamped = put_pixels_clamped_mmx; |
865 add_pixels_clamped = add_pixels_clamped_mmx; | |
296 | 866 clear_blocks= clear_blocks_mmx; |
415 | 867 |
294 | 868 pix_abs16x16 = pix_abs16x16_mmx; |
869 pix_abs16x16_x2 = pix_abs16x16_x2_mmx; | |
870 pix_abs16x16_y2 = pix_abs16x16_y2_mmx; | |
0 | 871 pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; |
294 | 872 pix_abs8x8 = pix_abs8x8_mmx; |
873 pix_abs8x8_x2 = pix_abs8x8_x2_mmx; | |
874 pix_abs8x8_y2 = pix_abs8x8_y2_mmx; | |
875 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; | |
0 | 876 av_fdct = fdct_mmx; |
877 | |
878 put_pixels_tab[0] = put_pixels_mmx; | |
879 put_pixels_tab[1] = put_pixels_x2_mmx; | |
880 put_pixels_tab[2] = put_pixels_y2_mmx; | |
881 put_pixels_tab[3] = put_pixels_xy2_mmx; | |
882 | |
883 put_no_rnd_pixels_tab[0] = put_pixels_mmx; | |
884 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
885 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
886 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; | |
415 | 887 |
0 | 888 avg_pixels_tab[0] = avg_pixels_mmx; |
889 avg_pixels_tab[1] = avg_pixels_x2_mmx; | |
890 avg_pixels_tab[2] = avg_pixels_y2_mmx; | |
891 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
892 | |
893 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; | |
894 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; | |
895 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; | |
896 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; | |
386 | 897 |
0 | 898 if (mm_flags & MM_MMXEXT) { |
294 | 899 pix_abs16x16 = pix_abs16x16_mmx2; |
900 pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | |
901 pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | |
902 pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2; | |
415 | 903 |
294 | 904 pix_abs8x8 = pix_abs8x8_mmx2; |
905 pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; | |
906 pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; | |
907 pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; | |
386 | 908 |
909 put_pixels_tab[1] = put_pixels_x2_mmx2; | |
910 put_pixels_tab[2] = put_pixels_y2_mmx2; | |
911 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; | |
912 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; | |
415 | 913 |
386 | 914 avg_pixels_tab[0] = avg_pixels_mmx2; |
915 avg_pixels_tab[1] = avg_pixels_x2_mmx2; | |
916 avg_pixels_tab[2] = avg_pixels_y2_mmx2; | |
917 avg_pixels_tab[3] = avg_pixels_xy2_mmx2; | |
0 | 918 } else if (mm_flags & MM_3DNOW) { |
919 put_pixels_tab[1] = put_pixels_x2_3dnow; | |
920 put_pixels_tab[2] = put_pixels_y2_3dnow; | |
386 | 921 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; |
922 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; | |
393 | 923 |
0 | 924 avg_pixels_tab[0] = avg_pixels_3dnow; |
925 avg_pixels_tab[1] = avg_pixels_x2_3dnow; | |
926 avg_pixels_tab[2] = avg_pixels_y2_3dnow; | |
927 avg_pixels_tab[3] = avg_pixels_xy2_3dnow; | |
928 } | |
19
82d4c9be9873
MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2
arpi_esp
parents:
8
diff
changeset
|
929 |
42 | 930 /* idct */ |
931 if (mm_flags & MM_MMXEXT) { | |
932 ff_idct = ff_mmxext_idct; | |
933 } else { | |
934 ff_idct = ff_mmx_idct; | |
935 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
936 #ifdef SIMPLE_IDCT |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
937 // ff_idct = simple_idct; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
938 ff_idct = simple_idct_mmx; |
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
151
diff
changeset
|
939 #endif |
0 | 940 } |
247
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
941 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
942 #if 0 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
943 // for speed testing |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
944 get_pixels = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
945 put_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
946 add_pixels_clamped = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
947 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
948 pix_abs16x16 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
949 pix_abs16x16_x2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
950 pix_abs16x16_y2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
951 pix_abs16x16_xy2 = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
952 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
953 put_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
954 put_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
955 put_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
956 put_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
957 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
958 put_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
959 put_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
960 put_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
961 put_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
962 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
963 avg_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
964 avg_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
965 avg_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
966 avg_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
967 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
968 avg_no_rnd_pixels_tab[0] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
969 avg_no_rnd_pixels_tab[1] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
970 avg_no_rnd_pixels_tab[2] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
971 avg_no_rnd_pixels_tab[3] = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
972 |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
973 //av_fdct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
974 //ff_idct = just_return; |
6f48cacd9ed9
* some modifications to allow gcc to compile same code for -fPIC
kabi
parents:
188
diff
changeset
|
975 #endif |
0 | 976 } |
402 | 977 |
978 /* remove any non bit exact operation (testing purpose). NOTE that | |
979 this function should be kept as small as possible because it is | |
980 always difficult to test automatically non bit exact cases. */ | |
981 void dsputil_set_bit_exact_mmx(void) | |
982 { | |
983 if (mm_flags & MM_MMX) { | |
984 if (mm_flags & MM_MMXEXT) { | |
985 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
986 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
987 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
988 } else if (mm_flags & MM_3DNOW) { | |
989 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; | |
990 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; | |
991 avg_pixels_tab[3] = avg_pixels_xy2_mmx; | |
992 } | |
993 } | |
994 } |