Mercurial > libavcodec.hg
annotate i386/simple_idct_mmx.c @ 3990:746a60ba3177 libavcodec
enable CMOV_IS_FAST as its faster or equal speed on every cpu (duron, athlon, PM, P3) from which ive seen benchmarks, it might be slower on P4 but noone has posted benchmarks ...
author | michael |
---|---|
date | Wed, 11 Oct 2006 12:23:40 +0000 |
parents | c8c591fe26f8 |
children | d5ba514e3f4a |
rev | line source |
---|---|
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1 /* |
429 | 2 * Simple IDCT MMX |
3 * | |
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> | |
5 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
429 | 9 * modify it under the terms of the GNU Lesser General Public |
10 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
429 | 12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
429 | 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
429 | 21 */ |
22 #include "../dsputil.h" | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1845
diff
changeset
|
23 #include "../simple_idct.h" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
24 |
/*
 * Fixed-point cosine factors shared by the C and MMX IDCT code:
 *
 *     Ck = cos(k*M_PI/16) * sqrt(2) * (1<<14)
 *
 * Values before rounding, k = 0..7:
 *     23170.475006
 *     22725.260826
 *     21406.727617
 *     19265.545870
 *     16384.000000
 *     12872.826198
 *      8866.956905
 *      4520.335430
 *
 * Every factor is rounded to nearest (value + 0.5, truncated) except C4,
 * which is taken as value - 0.5, i.e. 16383 instead of 16384.
 */
#define C0 23170
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16383 /* not 16384, see above */
#define C5 12873
#define C6 8867
#define C7 4520

/* Right shift applied to the results of the row pass. */
#define ROW_SHIFT 11
/* Right shift applied to the results of the column pass. */
#define COL_SHIFT 20 // 6
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
50 |
1845
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1469
diff
changeset
|
51 static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; |
3054613980a8
attribute used patch by (mitya at school dot ioffe dot ru (Dmitry Baryshkov))
michael
parents:
1469
diff
changeset
|
52 static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; |
1469
43ceb6e34b06
another non const static, maybe its thread save now
michaelni
parents:
1064
diff
changeset
|
53 |
43ceb6e34b06
another non const static, maybe its thread save now
michaelni
parents:
1064
diff
changeset
|
54 static const int16_t __attribute__((aligned(8))) coeffs[]= { |
2979 | 55 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
56 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, | |
57 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), | |
58 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, | |
59 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) | |
60 // 0, 0, 0, 0, | |
61 // 0, 0, 0, 0, | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
62 |
209 | 63 C4, C4, C4, C4, |
64 C4, -C4, C4, -C4, | |
2967 | 65 |
209 | 66 C2, C6, C2, C6, |
67 C6, -C2, C6, -C2, | |
2967 | 68 |
209 | 69 C1, C3, C1, C3, |
70 C5, C7, C5, C7, | |
2967 | 71 |
209 | 72 C3, -C7, C3, -C7, |
73 -C1, -C5, -C1, -C5, | |
2967 | 74 |
209 | 75 C5, -C1, C5, -C1, |
76 C7, C3, C7, C3, | |
2967 | 77 |
209 | 78 C7, -C5, C7, -C5, |
79 C3, -C1, C3, -C1 | |
80 }; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
81 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
213
diff
changeset
|
82 #if 0 |
209 | 83 static void unused_var_killer(){ |
2979 | 84 int a= wm1010 + d40000; |
85 temp[0]=a; | |
209 | 86 } |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
87 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
88 static void inline idctCol (int16_t * col, int16_t *input) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
89 { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
90 #undef C0 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
91 #undef C1 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
92 #undef C2 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
93 #undef C3 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
94 #undef C4 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
95 #undef C5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
96 #undef C6 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
97 #undef C7 |
2979 | 98 int a0, a1, a2, a3, b0, b1, b2, b3; |
99 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
100 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
101 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
102 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
103 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
104 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
105 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
106 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
107 /* |
2979 | 108 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { |
109 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = | |
110 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; | |
111 return; | |
112 }*/ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
113 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
114 col[8*0] = input[8*0 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
115 col[8*1] = input[8*2 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
116 col[8*2] = input[8*0 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
117 col[8*3] = input[8*2 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
118 col[8*4] = input[8*4 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
119 col[8*5] = input[8*6 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
120 col[8*6] = input[8*4 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
121 col[8*7] = input[8*6 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
122 |
2979 | 123 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1)); |
124 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1)); | |
125 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1)); | |
126 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1)); | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
127 |
2979 | 128 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; |
129 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; | |
130 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; | |
131 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
132 |
2979 | 133 col[8*0] = (a0 + b0) >> COL_SHIFT; |
134 col[8*1] = (a1 + b1) >> COL_SHIFT; | |
135 col[8*2] = (a2 + b2) >> COL_SHIFT; | |
136 col[8*3] = (a3 + b3) >> COL_SHIFT; | |
137 col[8*4] = (a3 - b3) >> COL_SHIFT; | |
138 col[8*5] = (a2 - b2) >> COL_SHIFT; | |
139 col[8*6] = (a1 - b1) >> COL_SHIFT; | |
140 col[8*7] = (a0 - b0) >> COL_SHIFT; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
141 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
142 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
143 static void inline idctRow (int16_t * output, int16_t * input) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
144 { |
2979 | 145 int16_t row[8]; |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
146 |
2979 | 147 int a0, a1, a2, a3, b0, b1, b2, b3; |
148 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
149 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
150 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
151 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
152 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
153 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
154 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
155 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
156 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
157 row[0] = input[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
158 row[2] = input[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
159 row[4] = input[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
160 row[6] = input[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
161 row[1] = input[8]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
162 row[3] = input[9]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
163 row[5] = input[12]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
164 row[7] = input[13]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
165 |
2979 | 166 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { |
167 row[0] = row[1] = row[2] = row[3] = row[4] = | |
168 row[5] = row[6] = row[7] = row[0]<<3; | |
169 output[0] = row[0]; | |
170 output[2] = row[1]; | |
171 output[4] = row[2]; | |
172 output[6] = row[3]; | |
173 output[8] = row[4]; | |
174 output[10] = row[5]; | |
175 output[12] = row[6]; | |
176 output[14] = row[7]; | |
177 return; | |
178 } | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
179 |
2979 | 180 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); |
181 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); | |
182 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); | |
183 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
184 |
2979 | 185 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; |
186 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
187 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
188 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
189 |
2979 | 190 row[0] = (a0 + b0) >> ROW_SHIFT; |
191 row[1] = (a1 + b1) >> ROW_SHIFT; | |
192 row[2] = (a2 + b2) >> ROW_SHIFT; | |
193 row[3] = (a3 + b3) >> ROW_SHIFT; | |
194 row[4] = (a3 - b3) >> ROW_SHIFT; | |
195 row[5] = (a2 - b2) >> ROW_SHIFT; | |
196 row[6] = (a1 - b1) >> ROW_SHIFT; | |
197 row[7] = (a0 - b0) >> ROW_SHIFT; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
198 |
2979 | 199 output[0] = row[0]; |
200 output[2] = row[1]; | |
201 output[4] = row[2]; | |
202 output[6] = row[3]; | |
203 output[8] = row[4]; | |
204 output[10] = row[5]; | |
205 output[12] = row[6]; | |
206 output[14] = row[7]; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
207 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
208 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
209 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
210 static inline void idct(int16_t *block) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
211 { |
2979 | 212 int64_t __attribute__((aligned(8))) align_tmp[16]; |
213 int16_t * const temp= (int16_t*)align_tmp; | |
1469
43ceb6e34b06
another non const static, maybe its thread save now
michaelni
parents:
1064
diff
changeset
|
214 |
2979 | 215 asm volatile( |
209 | 216 #if 0 //Alternative, simpler variant |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
217 |
209 | 218 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
2979 | 219 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
220 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
221 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
222 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
223 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
224 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
225 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
226 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
227 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
228 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
229 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
230 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
231 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
232 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
233 #rounder ", %%mm4 \n\t"\ | |
234 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
235 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
236 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
237 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
238 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
239 #rounder ", %%mm0 \n\t"\ | |
240 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
241 "paddd %%mm0, %%mm0 \n\t" \ | |
242 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
243 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
244 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
245 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
246 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
247 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
248 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
249 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
250 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
251 "psrad $" #shift ", %%mm7 \n\t"\ | |
252 "psrad $" #shift ", %%mm4 \n\t"\ | |
253 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
254 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
255 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
256 "psrad $" #shift ", %%mm1 \n\t"\ | |
257 "psrad $" #shift ", %%mm2 \n\t"\ | |
258 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
259 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
260 "movq %%mm7, " #dst " \n\t"\ | |
261 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
262 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
263 "movq %%mm2, 24+" #dst " \n\t"\ | |
264 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
265 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
266 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
267 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
268 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
269 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
270 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
271 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
272 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
273 "psrad $" #shift ", %%mm2 \n\t"\ | |
274 "psrad $" #shift ", %%mm0 \n\t"\ | |
275 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
276 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
277 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
278 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
279 "psrad $" #shift ", %%mm6 \n\t"\ | |
280 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
281 "movq %%mm2, 8+" #dst " \n\t"\ | |
282 "psrad $" #shift ", %%mm4 \n\t"\ | |
283 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
284 "movq %%mm4, 16+" #dst " \n\t"\ | |
209 | 285 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
286 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 287 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
288 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
289 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
290 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
291 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
292 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
293 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
294 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
295 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
296 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
297 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
298 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
299 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
300 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
301 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
302 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
303 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
304 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
305 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
306 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
307 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
308 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
309 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
310 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
311 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
312 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
313 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
314 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
315 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
316 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
317 "psrad $" #shift ", %%mm7 \n\t"\ | |
318 "psrad $" #shift ", %%mm4 \n\t"\ | |
319 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
320 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
321 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
322 "psrad $" #shift ", %%mm0 \n\t"\ | |
323 "psrad $" #shift ", %%mm2 \n\t"\ | |
324 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
325 "movd %%mm7, " #dst " \n\t"\ | |
326 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
327 "movd %%mm0, 16+" #dst " \n\t"\ | |
328 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
329 "movd %%mm2, 96+" #dst " \n\t"\ | |
330 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
331 "movd %%mm4, 112+" #dst " \n\t"\ | |
332 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
333 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
334 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
335 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
336 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
337 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
338 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
339 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
340 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
341 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
342 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
343 "psrad $" #shift ", %%mm2 \n\t"\ | |
344 "psrad $" #shift ", %%mm5 \n\t"\ | |
345 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
346 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
347 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
348 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
349 "psrad $" #shift ", %%mm6 \n\t"\ | |
350 "psrad $" #shift ", %%mm4 \n\t"\ | |
351 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
352 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
353 "movd %%mm2, 32+" #dst " \n\t"\ | |
354 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
355 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
356 "movd %%mm6, 48+" #dst " \n\t"\ | |
357 "movd %%mm4, 64+" #dst " \n\t"\ | |
358 "movd %%mm5, 80+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
359 |
2967 | 360 |
209 | 361 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
2979 | 362 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
363 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
364 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
365 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
366 "movq "MANGLE(wm1010)", %%mm4 \n\t"\ | |
367 "pand %%mm0, %%mm4 \n\t"\ | |
368 "por %%mm1, %%mm4 \n\t"\ | |
369 "por %%mm2, %%mm4 \n\t"\ | |
370 "por %%mm3, %%mm4 \n\t"\ | |
371 "packssdw %%mm4,%%mm4 \n\t"\ | |
372 "movd %%mm4, %%eax \n\t"\ | |
373 "orl %%eax, %%eax \n\t"\ | |
374 "jz 1f \n\t"\ | |
375 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
376 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
377 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
378 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
379 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
380 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
381 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
382 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
383 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
384 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
385 #rounder ", %%mm4 \n\t"\ | |
386 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
387 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
388 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
389 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
390 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
391 #rounder ", %%mm0 \n\t"\ | |
392 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
393 "paddd %%mm0, %%mm0 \n\t" \ | |
394 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
395 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
396 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
397 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
398 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
399 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
400 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
401 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
402 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
403 "psrad $" #shift ", %%mm7 \n\t"\ | |
404 "psrad $" #shift ", %%mm4 \n\t"\ | |
405 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
406 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
407 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
408 "psrad $" #shift ", %%mm1 \n\t"\ | |
409 "psrad $" #shift ", %%mm2 \n\t"\ | |
410 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
411 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
412 "movq %%mm7, " #dst " \n\t"\ | |
413 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
414 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
415 "movq %%mm2, 24+" #dst " \n\t"\ | |
416 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
417 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
418 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
419 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
420 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
421 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
422 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
423 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
424 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
425 "psrad $" #shift ", %%mm2 \n\t"\ | |
426 "psrad $" #shift ", %%mm0 \n\t"\ | |
427 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
428 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
429 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
430 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
431 "psrad $" #shift ", %%mm6 \n\t"\ | |
432 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
433 "movq %%mm2, 8+" #dst " \n\t"\ | |
434 "psrad $" #shift ", %%mm4 \n\t"\ | |
435 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
436 "movq %%mm4, 16+" #dst " \n\t"\ | |
437 "jmp 2f \n\t"\ | |
438 "1: \n\t"\ | |
439 "pslld $16, %%mm0 \n\t"\ | |
440 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ | |
441 "psrad $13, %%mm0 \n\t"\ | |
442 "packssdw %%mm0, %%mm0 \n\t"\ | |
443 "movq %%mm0, " #dst " \n\t"\ | |
444 "movq %%mm0, 8+" #dst " \n\t"\ | |
445 "movq %%mm0, 16+" #dst " \n\t"\ | |
446 "movq %%mm0, 24+" #dst " \n\t"\ | |
447 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
448 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
449 |
209 | 450 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
451 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
452 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) | |
453 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) | |
454 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
455 |
209 | 456 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) |
457 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) | |
458 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
459 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
460 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
461 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
462 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
463 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
464 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
465 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
466 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
467 #else |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
468 |
/*
 * DC_COND_IDCT: one 1-D IDCT pass over the four quadwords src0/src4/src1/src5
 * (register layout as given by the per-line comments: upper-case R* and
 * lower-case r* coefficients), storing four quadwords at dst, 8+dst, 16+dst
 * and 24+dst.
 *
 * Shortcut: when (src0 & wm1010) | src4 | src1 | src5 is zero, control jumps
 * to local label 1:, where the low word of every dword of src0 is shifted
 * (<<16, + d40000, arithmetic >>13), packed, and the same quadword is written
 * to all four outputs.  wm1010 and d40000 are defined outside this macro.
 * NOTE(review): presumably wm1010 masks out the DC words so that only a
 * DC-only input takes the shortcut -- confirm against its definition.
 *
 * rounder is pasted as an instruction plus source operand (e.g. paddd 8(%2))
 * and is applied to both C4 sums; shift is the final psrad count.
 * Coefficients are read from 16(%2) .. 104(%2).
 * Clobbers mm0-mm7 and eax; defines the local labels 1: and 2:.
 */
209 | 469 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
2979 | 470 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
471 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
472 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
473 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
474 "movq "MANGLE(wm1010)", %%mm4 \n\t" /* mask applied to src0 (defined elsewhere) */\ | |
475 "pand %%mm0, %%mm4 \n\t"\ | |
476 "por %%mm1, %%mm4 \n\t"\ | |
477 "por %%mm2, %%mm4 \n\t"\ | |
478 "por %%mm3, %%mm4 \n\t" /* mm4 = (src0 & mask) | src4 | src1 | src5 */\ | |
479 "packssdw %%mm4,%%mm4 \n\t" /* fold both dwords into the low 32 bits */\ | |
480 "movd %%mm4, %%eax \n\t"\ | |
481 "orl %%eax, %%eax \n\t"\ | |
482 "jz 1f \n\t" /* nothing set: take the shortcut at 1: */\ | |
483 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
484 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
485 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
486 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
487 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
488 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
489 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
490 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
491 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
492 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
493 #rounder ", %%mm4 \n\t" /* rounder applied to C4R4+C4R0 */\ | |
494 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
495 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
496 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
497 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
498 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
499 #rounder ", %%mm0 \n\t" /* rounder applied to -C4R4+C4R0 */\ | |
500 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
501 "paddd %%mm0, %%mm0 \n\t" \ | |
502 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
503 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
504 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
505 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
506 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
507 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
508 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
509 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
510 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
511 "psrad $" #shift ", %%mm7 \n\t"\ | |
512 "psrad $" #shift ", %%mm4 \n\t"\ | |
513 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
514 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
515 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
516 "psrad $" #shift ", %%mm1 \n\t"\ | |
517 "psrad $" #shift ", %%mm2 \n\t"\ | |
518 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
519 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
520 "movq %%mm7, " #dst " \n\t"\ | |
521 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
522 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
523 "movq %%mm2, 24+" #dst " \n\t"\ | |
524 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
525 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
526 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
527 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
528 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
529 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
530 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
531 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
532 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
533 "psrad $" #shift ", %%mm2 \n\t"\ | |
534 "psrad $" #shift ", %%mm0 \n\t"\ | |
535 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
536 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
537 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
538 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
539 "psrad $" #shift ", %%mm6 \n\t"\ | |
540 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
541 "movq %%mm2, 8+" #dst " \n\t"\ | |
542 "psrad $" #shift ", %%mm4 \n\t"\ | |
543 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
544 "movq %%mm4, 16+" #dst " \n\t"\ | |
545 "jmp 2f \n\t" /* full path done: skip the shortcut */\ | |
546 "1: \n\t"\ | |
547 "pslld $16, %%mm0 \n\t" /* keep only the low word of each dword */\ | |
548 "paddd "MANGLE(d40000)", %%mm0 \n\t"\ | |
549 "psrad $13, %%mm0 \n\t" /* net shift of that word: << 3, sign-extending */\ | |
550 "packssdw %%mm0, %%mm0 \n\t"\ | |
551 "movq %%mm0, " #dst " \n\t" /* same quadword goes to all four outputs */\ | |
552 "movq %%mm0, 8+" #dst " \n\t"\ | |
553 "movq %%mm0, 16+" #dst " \n\t"\ | |
554 "movq %%mm0, 24+" #dst " \n\t"\ | |
555 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
556 |
/*
 * Z_COND_IDCT: 1-D IDCT pass over the quadwords src0/src4/src1/src5 that is
 * skipped completely when the input is zero.  The four source quadwords are
 * ORed together; if the result is zero the macro jumps to the caller-supplied
 * label bt (e.g. 4f) and nothing is stored to dst.  Otherwise the full
 * transform is computed and four quadwords are stored at dst, 8+dst, 16+dst
 * and 24+dst.
 *
 * rounder is pasted as an instruction plus source operand (e.g. paddd (%2))
 * and is applied to both C4 sums; shift is the final psrad count.
 * Coefficients are read from 16(%2) .. 104(%2).
 * Clobbers mm0-mm7 and eax.  The macro defines no labels of its own.
 * The last line ends in a line continuation, so the (empty) line that
 * follows the macro body is part of the definition.
 */
209 | 557 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
2979 | 558 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
559 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
560 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
561 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
562 "movq %%mm0, %%mm4 \n\t"\ | |
563 "por %%mm1, %%mm4 \n\t"\ | |
564 "por %%mm2, %%mm4 \n\t"\ | |
565 "por %%mm3, %%mm4 \n\t" /* mm4 = src0 | src4 | src1 | src5 */\ | |
566 "packssdw %%mm4,%%mm4 \n\t" /* fold both dwords into the low 32 bits */\ | |
567 "movd %%mm4, %%eax \n\t"\ | |
568 "orl %%eax, %%eax \n\t"\ | |
569 "jz " #bt " \n\t" /* all-zero input: leave via bt, dst untouched */\ | |
570 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
571 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
572 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
573 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
574 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
575 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
576 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
577 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
578 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
579 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
580 #rounder ", %%mm4 \n\t"\ | |
581 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
582 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
583 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
584 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
585 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
586 #rounder ", %%mm0 \n\t"\ | |
587 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
588 "paddd %%mm0, %%mm0 \n\t" \ | |
589 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
590 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
591 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
592 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
593 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
594 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
595 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
596 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
597 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
598 "psrad $" #shift ", %%mm7 \n\t"\ | |
599 "psrad $" #shift ", %%mm4 \n\t"\ | |
600 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
601 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
602 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
603 "psrad $" #shift ", %%mm1 \n\t"\ | |
604 "psrad $" #shift ", %%mm2 \n\t"\ | |
605 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
606 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
607 "movq %%mm7, " #dst " \n\t"\ | |
608 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
609 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
610 "movq %%mm2, 24+" #dst " \n\t"\ | |
611 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
612 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
613 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
614 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
615 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
616 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
617 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
618 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
619 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
620 "psrad $" #shift ", %%mm2 \n\t"\ | |
621 "psrad $" #shift ", %%mm0 \n\t"\ | |
622 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
623 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
624 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
625 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
626 "psrad $" #shift ", %%mm6 \n\t"\ | |
627 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
628 "movq %%mm2, 8+" #dst " \n\t"\ | |
629 "psrad $" #shift ", %%mm4 \n\t"\ | |
630 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
631 "movq %%mm4, 16+" #dst " \n\t"\ | |
209 | 632 |
/*
 * ROW_IDCT: unconditional 1-D IDCT pass over the quadwords
 * src0/src4/src1/src5; there is no zero test and no branch.  Four quadwords
 * are stored at dst, 8+dst, 16+dst and 24+dst.
 *
 * rounder is pasted as an instruction plus source operand and is applied to
 * both C4 sums; shift is the final psrad count.  Coefficients are read from
 * 16(%2) .. 104(%2).  Clobbers mm0-mm7.
 * The last line ends in a line continuation, so the (empty) line that
 * follows the macro body is part of the definition.
 */
633 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
2979 | 634 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
635 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
636 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
637 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
638 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
639 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
640 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
641 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
642 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
643 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
644 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
645 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
646 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
647 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
648 #rounder ", %%mm4 \n\t"\ | |
649 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
650 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
651 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
652 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
653 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
654 #rounder ", %%mm0 \n\t"\ | |
655 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
656 "paddd %%mm0, %%mm0 \n\t" \ | |
657 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
658 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
659 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
660 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
661 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
662 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
663 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
664 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
665 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
666 "psrad $" #shift ", %%mm7 \n\t"\ | |
667 "psrad $" #shift ", %%mm4 \n\t"\ | |
668 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
669 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
670 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
671 "psrad $" #shift ", %%mm1 \n\t"\ | |
672 "psrad $" #shift ", %%mm2 \n\t"\ | |
673 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
674 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
675 "movq %%mm7, " #dst " \n\t"\ | |
676 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
677 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
678 "movq %%mm2, 24+" #dst " \n\t"\ | |
679 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
680 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
681 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
682 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
683 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
684 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
685 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
686 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
687 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
688 "psrad $" #shift ", %%mm2 \n\t"\ | |
689 "psrad $" #shift ", %%mm0 \n\t"\ | |
690 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
691 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
692 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
693 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
694 "psrad $" #shift ", %%mm6 \n\t"\ | |
695 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
696 "movq %%mm2, 8+" #dst " \n\t"\ | |
697 "psrad $" #shift ", %%mm4 \n\t"\ | |
698 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
699 "movq %%mm4, 16+" #dst " \n\t"\ | |
209 | 700 |
/*
 * First pass: four 32-byte input groups are read through %0 and the results
 * are written through %1.  The first group uses DC_COND_IDCT; the other three
 * use Z_COND_IDCT, which writes nothing for an all-zero group and instead
 * branches to the label given as the last argument (4f, 2f, 1f).  Each label
 * starts a second-pass IDCT variant that does not read the skipped group.
 * The first group adds its rounder from 8(%2), the others from (%2).
 */
701 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
702 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
703 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) | |
704 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) | |
705 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) | |
706 | |
707 #undef IDCT | |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
/*
 * IDCT (all four inputs present): second-pass 1-D IDCT.  This definition is
 * the one in effect on the fall-through path, i.e. when none of the preceding
 * Z_COND_IDCT zero tests branched away.  It reads the quadwords
 * src0/src4/src1/src5, takes coefficients from 16(%2) .. 104(%2), shifts
 * every result right by shift (arithmetic) and stores eight 32-bit values
 * (movd) at dst + 0, 16, 32, 48, 64, 80, 96 and 112.  No rounder is applied.
 * Clobbers mm0-mm7.
 * NOTE(review): the other preprocessor branch of this file names the
 * equivalent macro COL_IDCT, so this is presumably the column pass.
 */
708 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 709 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
710 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
711 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
712 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
713 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
714 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
715 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
716 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
717 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
718 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
719 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
720 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
721 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
722 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
723 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
724 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
725 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
726 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
727 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
728 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
729 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
730 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
731 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
732 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
733 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
734 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
735 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
736 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
737 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
738 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
739 "psrad $" #shift ", %%mm7 \n\t"\ | |
740 "psrad $" #shift ", %%mm4 \n\t"\ | |
741 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
742 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
743 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
744 "psrad $" #shift ", %%mm0 \n\t"\ | |
745 "psrad $" #shift ", %%mm2 \n\t"\ | |
746 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
747 "movd %%mm7, " #dst " \n\t"\ | |
748 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
749 "movd %%mm0, 16+" #dst " \n\t"\ | |
750 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
751 "movd %%mm2, 96+" #dst " \n\t"\ | |
752 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
753 "movd %%mm4, 112+" #dst " \n\t"\ | |
754 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
755 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
756 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
757 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
758 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
759 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
760 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
761 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
762 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
763 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
764 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
765 "psrad $" #shift ", %%mm2 \n\t"\ | |
766 "psrad $" #shift ", %%mm5 \n\t"\ | |
767 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
768 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
769 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
770 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
771 "psrad $" #shift ", %%mm6 \n\t"\ | |
772 "psrad $" #shift ", %%mm4 \n\t"\ | |
773 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
774 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
775 "movd %%mm2, 32+" #dst " \n\t"\ | |
776 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
777 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
778 "movd %%mm6, 48+" #dst " \n\t"\ | |
779 "movd %%mm4, 64+" #dst " \n\t"\ | |
780 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
781 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
782 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
783 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
784 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
785 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
786 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
787 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 788 "jmp 9f \n\t" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
789 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
790 "#" ASMALIGN(4) \ |
2979 | 791 "4: \n\t" |
209 | 792 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
793 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
794 |
209 | 795 #undef IDCT |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
/*
 * IDCT (src1 skipped): second-pass variant in effect after label 4:, which is
 * reached when the Z_COND_IDCT for the 32(%0) group found all-zero input and
 * wrote nothing to 32(%1).  src1 is accepted so the call sites keep the same
 * shape, but it is never read: B0..B3 are formed from src5 alone.
 * Reads src0/src4/src5 and coefficients from 16(%2) .. 104(%2), shifts every
 * result right by shift (arithmetic) and stores eight 32-bit values (movd) at
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.  Clobbers mm0-mm7.
 */
796 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 797 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
798 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
799 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
800 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
801 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
802 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
803 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
804 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
805 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
806 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
807 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
808 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
809 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
810 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
811 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
812 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
813 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
814 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
815 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
816 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
817 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
818 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
819 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
820 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
821 "psrad $" #shift ", %%mm1 \n\t"\ | |
822 "psrad $" #shift ", %%mm4 \n\t"\ | |
823 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
824 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
825 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
826 "psrad $" #shift ", %%mm0 \n\t"\ | |
827 "psrad $" #shift ", %%mm2 \n\t"\ | |
828 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
829 "movd %%mm1, " #dst " \n\t"\ | |
830 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
831 "movd %%mm0, 16+" #dst " \n\t"\ | |
832 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
833 "movd %%mm2, 96+" #dst " \n\t"\ | |
834 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
835 "movd %%mm4, 112+" #dst " \n\t"\ | |
836 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
837 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
838 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
839 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
840 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
841 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
842 "psrad $" #shift ", %%mm2 \n\t"\ | |
843 "psrad $" #shift ", %%mm5 \n\t"\ | |
844 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
845 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
846 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
847 "psrad $" #shift ", %%mm6 \n\t"\ | |
848 "psrad $" #shift ", %%mm1 \n\t"\ | |
849 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
850 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
851 "movd %%mm2, 32+" #dst " \n\t"\ | |
852 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
853 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
854 "movd %%mm6, 48+" #dst " \n\t"\ | |
855 "movd %%mm1, 64+" #dst " \n\t"\ | |
856 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
857 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
858 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
859 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
860 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
861 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
862 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 863 "jmp 9f \n\t" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
864 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
865 "#" ASMALIGN(4) \ |
2979 | 866 "6: \n\t" |
209 | 867 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
868 |
209 | 869 #undef IDCT |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
/*
 * IDCT (src4 and src1 skipped): second-pass variant in effect after label 6:,
 * which is reached when the Z_COND_IDCT tests for both the 32(%0) and the
 * 64(%0) groups found all-zero input.  src4 and src1 are accepted so the call
 * sites keep the same shape, but neither is read: the C4 sum is reused
 * unchanged as both A0 and A3, the C4 difference as both A1 and A2, and
 * B0..B3 are formed from src5 alone.
 * Reads src0/src5 and coefficients from 16(%2) .. 104(%2), shifts every
 * result right by shift (arithmetic) and stores eight 32-bit values (movd) at
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.  Clobbers mm0-mm7.
 */
870 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 871 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
872 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
873 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
874 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
875 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
876 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
877 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
878 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
879 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
880 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
881 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
882 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
883 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
884 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
885 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
886 "psrad $" #shift ", %%mm1 \n\t"\ | |
887 "psrad $" #shift ", %%mm4 \n\t"\ | |
888 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
889 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
890 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
891 "psrad $" #shift ", %%mm0 \n\t"\ | |
892 "psrad $" #shift ", %%mm2 \n\t"\ | |
893 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
894 "movd %%mm1, " #dst " \n\t"\ | |
895 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
896 "movd %%mm0, 16+" #dst " \n\t"\ | |
897 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
898 "movd %%mm2, 96+" #dst " \n\t"\ | |
899 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
900 "movd %%mm4, 112+" #dst " \n\t"\ | |
901 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
902 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
903 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
904 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
905 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
906 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
907 "psrad $" #shift ", %%mm2 \n\t"\ | |
908 "psrad $" #shift ", %%mm5 \n\t"\ | |
909 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
910 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
911 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
912 "psrad $" #shift ", %%mm6 \n\t"\ | |
913 "psrad $" #shift ", %%mm1 \n\t"\ | |
914 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
915 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
916 "movd %%mm2, 32+" #dst " \n\t"\ | |
917 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
918 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
919 "movd %%mm6, 48+" #dst " \n\t"\ | |
920 "movd %%mm1, 64+" #dst " \n\t"\ | |
921 "movd %%mm5, 80+" #dst " \n\t" | |
209 | 922 |
923 | |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
924 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
925 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
926 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
927 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
928 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 929 "jmp 9f \n\t" |
209 | 930 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
931 "#" ASMALIGN(4) \ |
2979 | 932 "2: \n\t" |
209 | 933 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) |
934 | |
935 #undef IDCT | |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
/*
 * IDCT (src4 skipped): second-pass variant in effect after label 2:, which is
 * reached when the Z_COND_IDCT for the 64(%0) group found all-zero input and
 * wrote nothing to 64(%1).  src4 is accepted so the call sites keep the same
 * shape, but it is never read: the C4 sum is reused unchanged as both A0 and
 * A3, and the C4 difference as both A1 and A2.
 * Reads src0/src1/src5 and coefficients from 16(%2) .. 104(%2), shifts every
 * result right by shift (arithmetic) and stores eight 32-bit values (movd) at
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112.  Clobbers mm0-mm7.
 */
936 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 937 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
938 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
939 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
940 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
941 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
942 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
943 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
944 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
945 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
946 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
947 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
948 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
949 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
950 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
951 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
952 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
953 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
954 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
955 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
956 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
957 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
958 "psrad $" #shift ", %%mm7 \n\t"\ | |
959 "psrad $" #shift ", %%mm4 \n\t"\ | |
960 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
961 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
962 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
963 "psrad $" #shift ", %%mm0 \n\t"\ | |
964 "psrad $" #shift ", %%mm2 \n\t"\ | |
965 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
966 "movd %%mm7, " #dst " \n\t"\ | |
967 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
968 "movd %%mm0, 16+" #dst " \n\t"\ | |
969 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
970 "movd %%mm2, 96+" #dst " \n\t"\ | |
971 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
972 "movd %%mm4, 112+" #dst " \n\t"\ | |
973 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
974 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
975 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
976 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
977 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
978 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
979 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
980 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
981 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
982 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
983 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
984 "psrad $" #shift ", %%mm2 \n\t"\ | |
985 "psrad $" #shift ", %%mm5 \n\t"\ | |
986 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
987 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
988 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
989 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
990 "psrad $" #shift ", %%mm6 \n\t"\ | |
991 "psrad $" #shift ", %%mm4 \n\t"\ | |
992 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
993 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
994 "movd %%mm2, 32+" #dst " \n\t"\ | |
995 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
996 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
997 "movd %%mm6, 48+" #dst " \n\t"\ | |
998 "movd %%mm4, 64+" #dst " \n\t"\ | |
999 "movd %%mm5, 80+" #dst " \n\t" | |
209 | 1000 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1001 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1002 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1003 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1004 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1005 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 1006 "jmp 9f \n\t" |
209 | 1007 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
1008 "#" ASMALIGN(4) \ |
2979 | 1009 "3: \n\t" |
209 | 1010 #undef IDCT |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1011 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 1012 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1013 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
1014 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1015 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1016 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1017 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1018 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1019 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1020 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1021 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1022 "movq 64(%2), %%mm3 \n\t"\ | |
1023 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1024 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1025 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
1026 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1027 "psrad $" #shift ", %%mm7 \n\t"\ | |
1028 "psrad $" #shift ", %%mm4 \n\t"\ | |
1029 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
1030 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1031 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1032 "psrad $" #shift ", %%mm0 \n\t"\ | |
1033 "psrad $" #shift ", %%mm1 \n\t"\ | |
1034 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1035 "movd %%mm7, " #dst " \n\t"\ | |
1036 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1037 "movd %%mm0, 16+" #dst " \n\t"\ | |
1038 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1039 "movd %%mm1, 96+" #dst " \n\t"\ | |
1040 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1041 "movd %%mm4, 112+" #dst " \n\t"\ | |
1042 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1043 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1044 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1045 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ | |
1046 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
1047 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
1048 "psrad $" #shift ", %%mm1 \n\t"\ | |
1049 "psrad $" #shift ", %%mm5 \n\t"\ | |
1050 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1051 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1052 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
1053 "psrad $" #shift ", %%mm6 \n\t"\ | |
1054 "psrad $" #shift ", %%mm4 \n\t"\ | |
1055 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
1056 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1057 "movd %%mm1, 32+" #dst " \n\t"\ | |
1058 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1059 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1060 "movd %%mm6, 48+" #dst " \n\t"\ | |
1061 "movd %%mm4, 64+" #dst " \n\t"\ | |
1062 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1063 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1064 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1065 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1066 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1067 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1068 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1069 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 1070 "jmp 9f \n\t" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1071 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
1072 "#" ASMALIGN(4) \ |
2979 | 1073 "5: \n\t" |
209 | 1074 #undef IDCT |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1075 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 1076 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1077 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1078 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1079 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1080 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1081 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1082 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1083 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1084 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1085 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1086 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1087 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
1088 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1089 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1090 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1091 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1092 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1093 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ | |
1094 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1095 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1096 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1097 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1098 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1099 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1100 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1101 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ | |
1102 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ | |
1103 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ | |
1104 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ | |
1105 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ | |
1106 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ | |
1107 "psrad $" #shift ", %%mm4 \n\t"\ | |
1108 "psrad $" #shift ", %%mm7 \n\t"\ | |
1109 "psrad $" #shift ", %%mm3 \n\t"\ | |
1110 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ | |
1111 "movq %%mm4, " #dst " \n\t"\ | |
1112 "psrad $" #shift ", %%mm0 \n\t"\ | |
1113 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ | |
1114 "movq %%mm0, 16+" #dst " \n\t"\ | |
1115 "movq %%mm0, 96+" #dst " \n\t"\ | |
1116 "movq %%mm4, 112+" #dst " \n\t"\ | |
1117 "psrad $" #shift ", %%mm5 \n\t"\ | |
1118 "psrad $" #shift ", %%mm6 \n\t"\ | |
1119 "psrad $" #shift ", %%mm2 \n\t"\ | |
1120 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1121 "movq %%mm5, 32+" #dst " \n\t"\ | |
1122 "psrad $" #shift ", %%mm1 \n\t"\ | |
1123 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1124 "movq %%mm6, 48+" #dst " \n\t"\ | |
1125 "movq %%mm6, 64+" #dst " \n\t"\ | |
1126 "movq %%mm5, 80+" #dst " \n\t" | |
2967 | 1127 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1128 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1129 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1130 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1131 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1132 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1133 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 1134 "jmp 9f \n\t" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1135 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1136 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
1137 "#" ASMALIGN(4) \ |
2979 | 1138 "1: \n\t" |
209 | 1139 #undef IDCT |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1140 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 1141 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1142 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1143 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
1144 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1145 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1146 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1147 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1148 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1149 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1150 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1151 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1152 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1153 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1154 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1155 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
1156 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1157 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1158 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1159 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1160 "movq 64(%2), %%mm1 \n\t"\ | |
1161 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1162 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1163 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
1164 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1165 "psrad $" #shift ", %%mm7 \n\t"\ | |
1166 "psrad $" #shift ", %%mm4 \n\t"\ | |
1167 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ | |
1168 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1169 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1170 "psrad $" #shift ", %%mm0 \n\t"\ | |
1171 "psrad $" #shift ", %%mm3 \n\t"\ | |
1172 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1173 "movd %%mm7, " #dst " \n\t"\ | |
1174 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1175 "movd %%mm0, 16+" #dst " \n\t"\ | |
1176 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1177 "movd %%mm3, 96+" #dst " \n\t"\ | |
1178 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1179 "movd %%mm4, 112+" #dst " \n\t"\ | |
1180 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1181 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1182 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1183 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ | |
1184 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1185 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
1186 "psrad $" #shift ", %%mm3 \n\t"\ | |
1187 "psrad $" #shift ", %%mm5 \n\t"\ | |
1188 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1189 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1190 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
1191 "psrad $" #shift ", %%mm6 \n\t"\ | |
1192 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1193 "movd %%mm3, 32+" #dst " \n\t"\ | |
1194 "psrad $" #shift ", %%mm4 \n\t"\ | |
1195 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1196 "movd %%mm6, 48+" #dst " \n\t"\ | |
1197 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1198 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1199 "movd %%mm4, 64+" #dst " \n\t"\ | |
1200 "movd %%mm5, 80+" #dst " \n\t" | |
2967 | 1201 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1202 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1203 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1204 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1205 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1206 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1207 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
2979 | 1208 "jmp 9f \n\t" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1209 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1210 |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3565
diff
changeset
|
1211 "#" ASMALIGN(4) |
2979 | 1212 "7: \n\t" |
209 | 1213 #undef IDCT |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1214 #define IDCT(src0, src4, src1, src5, dst, shift) \ |
2979 | 1215 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
1216 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1217 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1218 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1219 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1220 "psrad $" #shift ", %%mm4 \n\t"\ | |
1221 "psrad $" #shift ", %%mm0 \n\t"\ | |
1222 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1223 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1224 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1225 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1226 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1227 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1228 "psrad $" #shift ", %%mm1 \n\t"\ | |
1229 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ | |
1230 "movq %%mm4, " #dst " \n\t"\ | |
1231 "psrad $" #shift ", %%mm2 \n\t"\ | |
1232 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ | |
1233 "movq %%mm0, 16+" #dst " \n\t"\ | |
1234 "movq %%mm0, 96+" #dst " \n\t"\ | |
1235 "movq %%mm4, 112+" #dst " \n\t"\ | |
1236 "movq %%mm0, 32+" #dst " \n\t"\ | |
1237 "movq %%mm4, 48+" #dst " \n\t"\ | |
1238 "movq %%mm4, 64+" #dst " \n\t"\ | |
1239 "movq %%mm0, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1240 |
3565
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1241 //IDCT( src0, src4, src1, src5, dst, shift) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1242 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1243 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1244 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
f086f8868bb6
Support for MacIntel, take xx: '/nop' illegal for old versions of GAS
gpoirier
parents:
3036
diff
changeset
|
1245 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1246 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1247 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1248 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1249 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1250 /* |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1251 Input |
209 | 1252 00 40 04 44 20 60 24 64 |
1253 10 30 14 34 50 70 54 74 | |
1254 01 41 03 43 21 61 23 63 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1255 11 31 13 33 51 71 53 73 |
209 | 1256 02 42 06 46 22 62 26 66 |
1257 12 32 16 36 52 72 56 76 | |
1258 05 45 07 47 25 65 27 67 | |
1259 15 35 17 37 55 75 57 77 | |
2967 | 1260 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1261 Temp |
209 | 1262 00 04 10 14 20 24 30 34 |
1263 40 44 50 54 60 64 70 74 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1264 01 03 11 13 21 23 31 33 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1265 41 43 51 53 61 63 71 73 |
209 | 1266 02 06 12 16 22 26 32 36 |
1267 42 46 52 56 62 66 72 76 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1268 05 07 15 17 25 27 35 37 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1269 45 47 55 57 65 67 75 77 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1270 */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1271 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1272 "9: \n\t" |
2979 | 1273 :: "r" (block), "r" (temp), "r" (coeffs) |
1274 : "%eax" | |
1275 ); | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1276 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1277 |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
/*
 * Public entry point for the plain transform: forwards the coefficient
 * block to this file's idct() routine and does nothing else.
 *
 * block: coefficient block handed to idct(); idct()'s final pass stores
 *        its output back through this same pointer, so the caller reads
 *        the result from block afterwards.
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1282 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1283 //FIXME merge add/put into the idct |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1284 |
1064 | 1285 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1286 { |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1287 idct(block); |
854
3034f1816596
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1288 put_pixels_clamped_mmx(block, dest, line_size); |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1289 } |
1064 | 1290 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1291 { |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1292 idct(block); |
854
3034f1816596
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1293 add_pixels_clamped_mmx(block, dest, line_size); |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1294 } |