Mercurial > libavcodec.hg
annotate i386/simple_idct_mmx.c @ 1795:920e6381e1fe libavcodec
2 byte shorter userdata for mpeg4
in the past it was startcode,string,00,7F,startcode
now it is startcode,string,startcode
both are mpeg4 compliant, as according to the standard the userdata lasts until the next 00 00 01 (startcode prefix), but some very primitive decoders which simply skip until the first 00 byte and then expect the next valid startcode might fail with the old variant; this is just a theory though (didn't test whether quicktime can decode it now)
author | michael |
---|---|
date | Sun, 08 Feb 2004 22:52:35 +0000 |
parents | 43ceb6e34b06 |
children | 3054613980a8 |
rev | line source |
---|---|
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1 /* |
429 | 2 * Simple IDCT MMX |
3 * | |
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> | |
5 * | |
6 * This library is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2 of the License, or (at your option) any later version. | |
10 * | |
11 * This library is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
19 */ | |
20 #include "../dsputil.h" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
21 |
351 | 22 /* |
23 23170.475006 | |
24 22725.260826 | |
25 21406.727617 | |
26 19265.545870 | |
27 16384.000000 | |
28 12872.826198 | |
29 8866.956905 | |
30 4520.335430 | |
31 */ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
32 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
33 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
34 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
35 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
351 | 36 #if 0 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
37 #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
351 | 38 #else |
39 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 | |
40 #endif | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
41 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
42 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
43 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
44 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
45 #define ROW_SHIFT 11 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
46 #define COL_SHIFT 20 // 6 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
47 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
213
diff
changeset
|
48 static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; |
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
213
diff
changeset
|
49 static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; |
1469
43ceb6e34b06
another non-const static, maybe it's thread-safe now
michaelni
parents:
1064
diff
changeset
|
50 |
43ceb6e34b06
another non-const static, maybe it's thread-safe now
michaelni
parents:
1064
diff
changeset
|
51 static const int16_t __attribute__((aligned(8))) coeffs[]= { |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
52 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
53 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
54 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
55 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
56 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
57 // 0, 0, 0, 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
58 // 0, 0, 0, 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
59 |
209 | 60 C4, C4, C4, C4, |
61 C4, -C4, C4, -C4, | |
62 | |
63 C2, C6, C2, C6, | |
64 C6, -C2, C6, -C2, | |
65 | |
66 C1, C3, C1, C3, | |
67 C5, C7, C5, C7, | |
68 | |
69 C3, -C7, C3, -C7, | |
70 -C1, -C5, -C1, -C5, | |
71 | |
72 C5, -C1, C5, -C1, | |
73 C7, C3, C7, C3, | |
74 | |
75 C7, -C5, C7, -C5, | |
76 C3, -C1, C3, -C1 | |
77 }; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
78 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
213
diff
changeset
|
79 #if 0 |
209 | 80 static void unused_var_killer(){ |
81 int a= wm1010 + d40000; | |
82 temp[0]=a; | |
83 } | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
84 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
85 static void inline idctCol (int16_t * col, int16_t *input) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
86 { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
87 #undef C0 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
88 #undef C1 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
89 #undef C2 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
90 #undef C3 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
91 #undef C4 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
92 #undef C5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
93 #undef C6 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
94 #undef C7 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
95 int a0, a1, a2, a3, b0, b1, b2, b3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
96 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
97 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
98 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
99 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
352
5a8eb5cf9f92
C4=16383 for the c version too and even for some outcommented code
michaelni
parents:
351
diff
changeset
|
100 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
101 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
102 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
103 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
104 /* |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
105 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
106 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
107 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
108 return; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
109 }*/ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
110 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
111 col[8*0] = input[8*0 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
112 col[8*1] = input[8*2 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
113 col[8*2] = input[8*0 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
114 col[8*3] = input[8*2 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
115 col[8*4] = input[8*4 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
116 col[8*5] = input[8*6 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
117 col[8*6] = input[8*4 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
118 col[8*7] = input[8*6 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
119 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
120 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
121 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
122 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
123 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
124 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
125 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
126 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
127 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
128 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
129 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
130 col[8*0] = (a0 + b0) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
131 col[8*1] = (a1 + b1) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
132 col[8*2] = (a2 + b2) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
133 col[8*3] = (a3 + b3) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
134 col[8*4] = (a3 - b3) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
135 col[8*5] = (a2 - b2) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
136 col[8*6] = (a1 - b1) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
137 col[8*7] = (a0 - b0) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
138 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
139 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
140 static void inline idctRow (int16_t * output, int16_t * input) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
141 { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
142 int16_t row[8]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
143 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
144 int a0, a1, a2, a3, b0, b1, b2, b3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
145 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
146 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
147 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
148 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
352
5a8eb5cf9f92
C4=16383 for the c version too and even for some outcommented code
michaelni
parents:
351
diff
changeset
|
149 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
150 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
151 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
152 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
153 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
154 row[0] = input[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
155 row[2] = input[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
156 row[4] = input[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
157 row[6] = input[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
158 row[1] = input[8]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
159 row[3] = input[9]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
160 row[5] = input[12]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
161 row[7] = input[13]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
162 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
163 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
164 row[0] = row[1] = row[2] = row[3] = row[4] = |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
165 row[5] = row[6] = row[7] = row[0]<<3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
166 output[0] = row[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
167 output[2] = row[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
168 output[4] = row[2]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
169 output[6] = row[3]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
170 output[8] = row[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
171 output[10] = row[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
172 output[12] = row[6]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
173 output[14] = row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
174 return; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
175 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
176 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
177 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
178 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
179 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
180 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
181 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
182 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
183 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
184 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
185 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
186 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
187 row[0] = (a0 + b0) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
188 row[1] = (a1 + b1) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
189 row[2] = (a2 + b2) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
190 row[3] = (a3 + b3) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
191 row[4] = (a3 - b3) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
192 row[5] = (a2 - b2) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
193 row[6] = (a1 - b1) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
194 row[7] = (a0 - b0) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
195 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
196 output[0] = row[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
197 output[2] = row[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
198 output[4] = row[2]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
199 output[6] = row[3]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
200 output[8] = row[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
201 output[10] = row[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
202 output[12] = row[6]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
203 output[14] = row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
204 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
205 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
206 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
207 static inline void idct(int16_t *block) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
208 { |
1469
43ceb6e34b06
another non-const static, maybe it's thread-safe now
michaelni
parents:
1064
diff
changeset
|
209 int64_t __attribute__((aligned(8))) align_tmp[16]; |
43ceb6e34b06
another non-const static, maybe it's thread-safe now
michaelni
parents:
1064
diff
changeset
|
210 int16_t * const temp= (int16_t*)align_tmp; |
43ceb6e34b06
another non-const static, maybe it's thread-safe now
michaelni
parents:
1064
diff
changeset
|
211 |
209 | 212 asm volatile( |
213 #if 0 //Alternative, simpler variant | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
214 |
209 | 215 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
216 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
217 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
218 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
219 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 220 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
221 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
222 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
223 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
224 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
225 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
226 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
227 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
228 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
229 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
230 #rounder ", %%mm4 \n\t"\ | |
231 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
232 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 233 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
234 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
235 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
236 #rounder ", %%mm0 \n\t"\ | |
237 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
238 "paddd %%mm0, %%mm0 \n\t" \ | |
239 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
240 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
241 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
242 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
243 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
244 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
245 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
246 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
247 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
248 "psrad $" #shift ", %%mm7 \n\t"\ | |
249 "psrad $" #shift ", %%mm4 \n\t"\ | |
250 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
251 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
252 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
253 "psrad $" #shift ", %%mm1 \n\t"\ | |
254 "psrad $" #shift ", %%mm2 \n\t"\ | |
255 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
256 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
257 "movq %%mm7, " #dst " \n\t"\ | |
258 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
259 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
260 "movq %%mm2, 24+" #dst " \n\t"\ | |
261 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
262 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
263 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
264 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
265 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
266 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
267 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
268 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
269 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
270 "psrad $" #shift ", %%mm2 \n\t"\ | |
271 "psrad $" #shift ", %%mm0 \n\t"\ | |
272 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
273 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
274 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
275 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
276 "psrad $" #shift ", %%mm6 \n\t"\ | |
277 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
278 "movq %%mm2, 8+" #dst " \n\t"\ | |
279 "psrad $" #shift ", %%mm4 \n\t"\ | |
280 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
281 "movq %%mm4, 16+" #dst " \n\t"\ | |
282 | |
/*
 * COL_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Emits the asm text for one slice of the column pass. The four 64-bit
 * operands src0/src4/src1/src5 are loaded into mm0..mm3 and multiplied
 * against the coefficient table addressed through %2 (offsets 16..104)
 * with pmaddwd. The rounder argument is pasted in front of an mm4 and an
 * mm0 operand, so it has to be an instruction plus its source operand.
 * Every sum and difference is shifted right arithmetically by shift,
 * packed to 16 bit with packssdw and stored 32 bits at a time with movd:
 *   dst+0   A0+B0    dst+16  A1+B1    dst+32  A2+B2    dst+48  A3+B3
 *   dst+64  A3-B3    dst+80  A2-B2    dst+96  A1-B1    dst+112 A0-B0
 * In the inline comments the upper and lower case names (A0 / a0) appear to
 * label the two 32-bit lanes of a register. All of mm0..mm7 are overwritten.
 */
283 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
284 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
285 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
286 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
287 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
288 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
289 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
290 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
291 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
292 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
293 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
294 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
295 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
296 #rounder ", %%mm4 \n\t"\ |
209 | 297 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
298 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
299 #rounder ", %%mm0 \n\t"\ | |
300 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
301 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
302 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
303 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
304 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
305 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
306 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
307 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
308 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
309 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
310 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
311 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
312 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
313 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 314 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
315 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
316 "psrad $" #shift ", %%mm7 \n\t"\ | |
317 "psrad $" #shift ", %%mm4 \n\t"\ | |
318 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
319 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
320 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
321 "psrad $" #shift ", %%mm0 \n\t"\ | |
322 "psrad $" #shift ", %%mm2 \n\t"\ | |
323 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
324 "movd %%mm7, " #dst " \n\t"\ | |
325 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
326 "movd %%mm0, 16+" #dst " \n\t"\ | |
327 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
328 "movd %%mm2, 96+" #dst " \n\t"\ | |
329 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
330 "movd %%mm4, 112+" #dst " \n\t"\ | |
331 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
332 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
333 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
334 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
335 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
336 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
337 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
338 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
339 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
340 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
341 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
342 "psrad $" #shift ", %%mm2 \n\t"\ | |
343 "psrad $" #shift ", %%mm5 \n\t"\ | |
344 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
345 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
346 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
347 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
348 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
349 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 350 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
351 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
352 "movd %%mm2, 32+" #dst " \n\t"\ | |
353 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
354 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
355 "movd %%mm6, 48+" #dst " \n\t"\ | |
356 "movd %%mm4, 64+" #dst " \n\t"\ | |
357 "movd %%mm5, 80+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
358 |
209 | 359 |
/*
 * DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Row transform with a shortcut. After loading the four operands into
 * mm0..mm3 it ANDs mm0 with the wm1010 mask, ORs in mm1..mm3, packs the
 * result and tests it through %eax (which is overwritten):
 *   - non-zero: runs the full pmaddwd based transform against the table at
 *     %2, stores four quadwords at dst, 8+dst, 16+dst and 24+dst, then
 *     jumps forward to local label 2;
 *   - zero: at local label 1, shifts mm0 left by 16 and arithmetically
 *     right by 13, packs it and stores that one quadword to all four
 *     destination slots.
 * NOTE(review): wm1010 is defined outside this view; presumably it masks
 * away the DC term so the test means that only DC is set -- confirm.
 * The macro defines the numeric local labels 1 and 2.
 */
360 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
361 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
362 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
363 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
364 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
365 "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
366 "pand %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
367 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
368 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
369 "por %%mm3, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
370 "packssdw %%mm4,%%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
371 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
372 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
373 "jz 1f \n\t"\ |
209 | 374 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
375 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
376 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
377 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
378 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
379 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
380 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
381 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
382 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
383 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
384 #rounder ", %%mm4 \n\t"\ |
209 | 385 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
386 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
387 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
388 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
389 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
390 #rounder ", %%mm0 \n\t"\ | |
391 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
392 "paddd %%mm0, %%mm0 \n\t" \ | |
393 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
394 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
395 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
396 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
397 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
398 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
399 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
400 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
401 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
402 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
403 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 404 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
405 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
406 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
407 "psrad $" #shift ", %%mm1 \n\t"\ | |
408 "psrad $" #shift ", %%mm2 \n\t"\ | |
409 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
410 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
411 "movq %%mm7, " #dst " \n\t"\ | |
412 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
413 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
414 "movq %%mm2, 24+" #dst " \n\t"\ | |
415 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
416 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
417 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
418 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
209 | 419 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
420 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
421 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
422 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
423 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
424 "psrad $" #shift ", %%mm2 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
425 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 426 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
427 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
428 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
429 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
430 "psrad $" #shift ", %%mm6 \n\t"\ | |
431 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
432 "movq %%mm2, 8+" #dst " \n\t"\ | |
433 "psrad $" #shift ", %%mm4 \n\t"\ | |
434 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
435 "movq %%mm4, 16+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
436 "jmp 2f \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
437 "1: \n\t"\ |
209 | 438 "pslld $16, %%mm0 \n\t"\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
439 "#paddd "MANGLE(d40000)", %%mm0 \n\t" /* NOTE(review): the leading # presumably makes the assembler skip this paddd -- confirm it is meant to stay disabled */\ |
209 | 440 "psrad $13, %%mm0 \n\t"\ |
441 "packssdw %%mm0, %%mm0 \n\t"\ | |
442 "movq %%mm0, " #dst " \n\t"\ | |
443 "movq %%mm0, 8+" #dst " \n\t"\ | |
444 "movq %%mm0, 16+" #dst " \n\t"\ | |
445 "movq %%mm0, 24+" #dst " \n\t"\ | |
446 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
447 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
448 |
// Row pass: four rows are read through %0 and written through %1, all with
// shift 11. Row 0 always runs ROW_IDCT with rounder paddd 8(%2); rows 1..3
// run DC_COND_ROW_IDCT with rounder paddd (%2). The block comment below
// holds the unconditional ROW_IDCT form of rows 1..3, left disabled.
209 | 449 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
450 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
451 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) | |
452 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) | |
453 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
454 |
209 | 455 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) |
456 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) | |
457 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
458 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
459 |
// Column pass: four COL_IDCT slices read through %1 (the buffer the row
// pass stored to) and write through %0 at byte offsets 0, 4, 8 and 12,
// all with shift 20.
// NOTE(review): the rounder argument /nop presumably turns the two rounder
// lines into assembler comments so no rounding instruction is emitted -- confirm.
209 | 460 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
461 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
462 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
463 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
464 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
465 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
466 #else |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
467 |
/*
 * DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Row transform with a shortcut. After loading the four operands into
 * mm0..mm3 it ANDs mm0 with the wm1010 mask, ORs in mm1..mm3, packs the
 * result and tests it through %eax (which is overwritten):
 *   - non-zero: runs the full pmaddwd based transform against the table at
 *     %2, stores four quadwords at dst, 8+dst, 16+dst and 24+dst, then
 *     jumps forward to local label 2;
 *   - zero: at local label 1, shifts mm0 left by 16, adds d40000, shifts
 *     arithmetically right by 13, packs and stores that one quadword to all
 *     four destination slots.
 * NOTE(review): wm1010 is defined outside this view; presumably it masks
 * away the DC term so the test means that only DC is set -- confirm.
 * The macro defines the numeric local labels 1 and 2.
 */
209 | 468 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
469 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
470 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
471 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
472 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
473 "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
474 "pand %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
475 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
476 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
477 "por %%mm3, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
478 "packssdw %%mm4,%%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
479 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
480 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
481 "jz 1f \n\t"\ |
209 | 482 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
483 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
484 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
485 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
486 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
487 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
488 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
489 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
490 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
491 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
492 #rounder ", %%mm4 \n\t"\ |
209 | 493 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
494 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
495 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
496 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
497 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
498 #rounder ", %%mm0 \n\t"\ | |
499 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
500 "paddd %%mm0, %%mm0 \n\t" \ | |
501 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
502 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
503 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
504 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
505 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
506 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
507 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
508 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
509 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
510 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
511 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 512 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
513 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
514 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
515 "psrad $" #shift ", %%mm1 \n\t"\ | |
516 "psrad $" #shift ", %%mm2 \n\t"\ | |
517 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
518 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
519 "movq %%mm7, " #dst " \n\t"\ | |
520 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
521 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
522 "movq %%mm2, 24+" #dst " \n\t"\ | |
523 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
524 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
525 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
526 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
209 | 527 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
528 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
529 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
530 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
531 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
532 "psrad $" #shift ", %%mm2 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
533 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 534 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
535 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
536 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
537 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
538 "psrad $" #shift ", %%mm6 \n\t"\ | |
539 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
540 "movq %%mm2, 8+" #dst " \n\t"\ | |
541 "psrad $" #shift ", %%mm4 \n\t"\ | |
542 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
543 "movq %%mm4, 16+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
544 "jmp 2f \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
545 "1: \n\t"\ |
209 | 546 "pslld $16, %%mm0 \n\t"\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
547 "paddd "MANGLE(d40000)", %%mm0 \n\t" /* bias added ahead of the psrad below; d40000 is defined outside this view */\ |
209 | 548 "psrad $13, %%mm0 \n\t"\ |
549 "packssdw %%mm0, %%mm0 \n\t"\ | |
550 "movq %%mm0, " #dst " \n\t"\ | |
551 "movq %%mm0, 8+" #dst " \n\t"\ | |
552 "movq %%mm0, 16+" #dst " \n\t"\ | |
553 "movq %%mm0, 24+" #dst " \n\t"\ | |
554 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
555 |
/*
 * Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt)
 *
 * Row transform that is skipped for an all-zero row. The four operands are
 * loaded into mm0..mm3 and ORed together; the packed result is tested
 * through %eax (which is overwritten). When it is zero the code jumps to
 * the label passed as bt and nothing is written to dst. Otherwise the row
 * is transformed with pmaddwd against the table at %2, the rounder
 * instruction is applied to mm4 and mm0, results are shifted right by shift
 * and packed, and four quadwords are stored:
 *   dst     A1+B1 a1+b1 A0+B0 a0+b0      8+dst   A3+B3 a3+b3 A2+B2 a2+b2
 *   16+dst  A2-B2 a2-b2 A3-B3 a3-b3      24+dst  A0-B0 a0-b0 A1-B1 a1-b1
 * The expansion ends with a line continuation, so the line that follows the
 * macro body has to stay blank.
 */
209 | 556 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
557 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
558 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
559 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
560 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
561 "movq %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
562 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
563 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
564 "por %%mm3, %%mm4 \n\t"\ |
209 | 565 "packssdw %%mm4,%%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
566 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
567 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
568 "jz " #bt " \n\t"\ |
209 | 569 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
570 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
571 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
572 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
573 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
574 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
575 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
576 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
577 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
578 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
579 #rounder ", %%mm4 \n\t"\ | |
580 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
581 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
582 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
583 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
584 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
585 #rounder ", %%mm0 \n\t"\ | |
586 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
587 "paddd %%mm0, %%mm0 \n\t" \ | |
588 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
589 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
590 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
591 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
592 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
593 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
594 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
595 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
596 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
597 "psrad $" #shift ", %%mm7 \n\t"\ | |
598 "psrad $" #shift ", %%mm4 \n\t"\ | |
599 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
600 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
601 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
602 "psrad $" #shift ", %%mm1 \n\t"\ | |
603 "psrad $" #shift ", %%mm2 \n\t"\ | |
604 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
605 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
606 "movq %%mm7, " #dst " \n\t"\ | |
607 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
608 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
609 "movq %%mm2, 24+" #dst " \n\t"\ | |
610 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
611 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
612 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
613 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
614 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
615 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
616 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
617 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
618 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
619 "psrad $" #shift ", %%mm2 \n\t"\ | |
620 "psrad $" #shift ", %%mm0 \n\t"\ | |
621 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
622 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
623 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
624 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
625 "psrad $" #shift ", %%mm6 \n\t"\ | |
626 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
627 "movq %%mm2, 8+" #dst " \n\t"\ | |
628 "psrad $" #shift ", %%mm4 \n\t"\ | |
629 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
630 "movq %%mm4, 16+" #dst " \n\t"\ | |
631 | |
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Unconditional row transform. The four operands are loaded into mm0..mm3
 * and multiplied against the coefficient table at %2 (offsets 16..104) with
 * pmaddwd; the rounder instruction is applied to mm4 and mm0; sums and
 * differences are shifted right arithmetically by shift, packed to 16 bit
 * and stored as four quadwords:
 *   dst     A1+B1 a1+b1 A0+B0 a0+b0      8+dst   A3+B3 a3+b3 A2+B2 a2+b2
 *   16+dst  A2-B2 a2-b2 A3-B3 a3-b3      24+dst  A0-B0 a0-b0 A1-B1 a1-b1
 * All of mm0..mm7 are overwritten. The expansion ends with a line
 * continuation, so the line that follows the macro body has to stay blank.
 */
632 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
633 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
634 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
635 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
636 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
637 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
638 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
639 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
640 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
641 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
642 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
643 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
644 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
645 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
646 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
647 #rounder ", %%mm4 \n\t"\ | |
648 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
649 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 650 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
651 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
652 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
653 #rounder ", %%mm0 \n\t"\ | |
654 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
655 "paddd %%mm0, %%mm0 \n\t" \ | |
656 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
657 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
658 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
659 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
660 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
661 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
662 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
663 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
664 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
665 "psrad $" #shift ", %%mm7 \n\t"\ | |
666 "psrad $" #shift ", %%mm4 \n\t"\ | |
667 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
668 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
669 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
670 "psrad $" #shift ", %%mm1 \n\t"\ | |
671 "psrad $" #shift ", %%mm2 \n\t"\ | |
672 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
673 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
674 "movq %%mm7, " #dst " \n\t"\ | |
675 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
676 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
677 "movq %%mm2, 24+" #dst " \n\t"\ | |
678 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
679 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
680 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
681 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
682 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
683 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
684 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
685 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
686 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
687 "psrad $" #shift ", %%mm2 \n\t"\ | |
688 "psrad $" #shift ", %%mm0 \n\t"\ | |
689 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
690 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
691 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
692 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
693 "psrad $" #shift ", %%mm6 \n\t"\ | |
694 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
695 "movq %%mm2, 8+" #dst " \n\t"\ | |
696 "psrad $" #shift ", %%mm4 \n\t"\ | |
697 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
698 "movq %%mm4, 16+" #dst " \n\t"\ | |
699 | |
// Row pass of this branch: four rows are read through %0 and written
// through %1, all with shift 11. Row 0 runs DC_COND_IDCT with rounder
// paddd 8(%2); rows 1..3 run Z_COND_IDCT with rounder paddd (%2) and, when
// the row is entirely zero, jump forward to local label 4, 2 or 1
// respectively (those labels are defined further down, outside this view).
700 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
701 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
702 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) | |
703 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) | |
704 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) | |
705 | |
706 #undef IDCT | |
/*
 * IDCT, general variant: loads all four source operands (src0, src4, src1,
 * src5).  A0..A3 are formed from src0/src4, B0..B3 from src1/src5.
 * Each expansion stores eight results with movd (two packed words each):
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 * rounder is stringified and emitted directly in front of the mm4 / mm0
 * operand text, so it has to supply the mnemonic and the source operand.
 * shift is the psrad count applied to every result before packing.
 * Coefficient quads are read from fixed offsets 16..104 of asm operand %2.
 * NOTE(review): the callers in this file pass /nop as rounder, which
 * presumably makes the assembler treat that line as a comment -- confirm.
 */
707 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
708 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
709 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
710 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
711 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
712 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
713 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
714 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
715 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
716 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
717 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
718 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
719 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
720 #rounder ", %%mm4 \n\t"\ |
209 | 721 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
722 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
723 #rounder ", %%mm0 \n\t"\ | |
724 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
725 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
726 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
727 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
728 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
729 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
730 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
731 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
732 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
733 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
734 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
735 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
736 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
737 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 738 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
739 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
740 "psrad $" #shift ", %%mm7 \n\t"\ | |
741 "psrad $" #shift ", %%mm4 \n\t"\ | |
742 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
743 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
744 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
745 "psrad $" #shift ", %%mm0 \n\t"\ | |
746 "psrad $" #shift ", %%mm2 \n\t"\ | |
747 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
748 "movd %%mm7, " #dst " \n\t"\ | |
749 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
750 "movd %%mm0, 16+" #dst " \n\t"\ | |
751 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
752 "movd %%mm2, 96+" #dst " \n\t"\ | |
753 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
754 "movd %%mm4, 112+" #dst " \n\t"\ | |
755 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
756 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
757 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
758 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
759 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
760 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
761 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
762 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
763 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
764 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
765 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
766 "psrad $" #shift ", %%mm2 \n\t"\ | |
767 "psrad $" #shift ", %%mm5 \n\t"\ | |
768 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
769 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
770 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
771 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
772 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
773 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 774 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
775 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
776 "movd %%mm2, 32+" #dst " \n\t"\ | |
777 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
778 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
779 "movd %%mm6, 48+" #dst " \n\t"\ | |
780 "movd %%mm4, 64+" #dst " \n\t"\ | |
781 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
782 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
783 |
209 | 784 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
785 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
786 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
787 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
788 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
789 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
790 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
791 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
792 "4: \n\t" |
209 | 793 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
794 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
795 |
209 | 796 #undef IDCT |
/*
 * IDCT variant that never loads src1.
 * A0..A3 are built from src0 and src4; B0..B3 come from src5 alone (the
 * products involving R1/R3 are left out).  Results are stored with movd at
 * dst, 16+dst, ..., 112+dst; rounder is emitted in front of the mm4 / mm0
 * operand text and shift is the psrad count.
 * NOTE(review): presumably selected when the data behind src1 is known to
 * be all zero -- confirm against Z_COND_IDCT, which is defined elsewhere.
 */
797 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
798 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
799 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
800 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 801 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
802 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
803 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
804 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
805 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
806 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
807 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
808 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
809 #rounder ", %%mm4 \n\t"\ | |
810 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
811 #rounder ", %%mm0 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
812 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 813 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
814 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
815 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
816 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
817 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
818 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
819 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
820 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
209 | 821 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
822 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
823 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
824 "psrad $" #shift ", %%mm1 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
825 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 826 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
827 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
828 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
829 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 830 "psrad $" #shift ", %%mm2 \n\t"\ |
831 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
832 "movd %%mm1, " #dst " \n\t"\ | |
833 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
834 "movd %%mm0, 16+" #dst " \n\t"\ | |
835 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
836 "movd %%mm2, 96+" #dst " \n\t"\ | |
837 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
838 "movd %%mm4, 112+" #dst " \n\t"\ | |
839 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
840 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
841 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
842 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
843 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
844 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
845 "psrad $" #shift ", %%mm2 \n\t"\ | |
846 "psrad $" #shift ", %%mm5 \n\t"\ | |
847 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
848 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
849 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
850 "psrad $" #shift ", %%mm6 \n\t"\ | |
851 "psrad $" #shift ", %%mm1 \n\t"\ | |
852 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
853 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
854 "movd %%mm2, 32+" #dst " \n\t"\ | |
855 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
856 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
857 "movd %%mm6, 48+" #dst " \n\t"\ | |
858 "movd %%mm1, 64+" #dst " \n\t"\ | |
859 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
860 |
209 | 861 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
862 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
863 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
864 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
865 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
866 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
867 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
868 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
869 "6: \n\t" |
209 | 870 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
871 |
209 | 872 #undef IDCT |
/*
 * IDCT variant that loads only src0 and src5.
 * With no src4 contribution, mm6 and mm5 are plain copies of mm4 and mm0,
 * so A3 equals A0 and A2 equals A1; B0..B3 come from src5 alone.
 * Results are stored with movd at dst, 16+dst, ..., 112+dst; rounder is
 * emitted in front of the mm4 / mm0 operand text and shift is the psrad
 * count.
 * NOTE(review): presumably selected when the data behind src4 and src1 is
 * known to be all zero -- confirm against Z_COND_IDCT, defined elsewhere.
 */
873 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
874 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
875 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
876 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
877 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
878 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
879 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
880 #rounder ", %%mm4 \n\t"\ | |
881 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
882 #rounder ", %%mm0 \n\t"\ | |
883 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
884 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
885 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
886 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
887 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
888 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
889 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
890 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
891 "psrad $" #shift ", %%mm1 \n\t"\ | |
892 "psrad $" #shift ", %%mm4 \n\t"\ | |
893 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
894 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
895 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
896 "psrad $" #shift ", %%mm0 \n\t"\ | |
897 "psrad $" #shift ", %%mm2 \n\t"\ | |
898 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
899 "movd %%mm1, " #dst " \n\t"\ | |
900 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
901 "movd %%mm0, 16+" #dst " \n\t"\ | |
902 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
903 "movd %%mm2, 96+" #dst " \n\t"\ | |
904 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
905 "movd %%mm4, 112+" #dst " \n\t"\ | |
906 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
907 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
908 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
909 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
910 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
911 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
912 "psrad $" #shift ", %%mm2 \n\t"\ | |
913 "psrad $" #shift ", %%mm5 \n\t"\ | |
914 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
915 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
916 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
917 "psrad $" #shift ", %%mm6 \n\t"\ | |
918 "psrad $" #shift ", %%mm1 \n\t"\ | |
919 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
920 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
921 "movd %%mm2, 32+" #dst " \n\t"\ | |
922 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
923 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
924 "movd %%mm6, 48+" #dst " \n\t"\ | |
925 "movd %%mm1, 64+" #dst " \n\t"\ | |
926 "movd %%mm5, 80+" #dst " \n\t" | |
927 | |
928 | |
929 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
930 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
931 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
932 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
933 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
934 "jmp 9f \n\t" | |
935 | |
936 "#.balign 16 \n\t"\ | |
937 "2: \n\t" | |
938 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) | |
939 | |
940 #undef IDCT | |
/*
 * IDCT variant that never loads src4.
 * With no src4 contribution, mm6 and mm5 are plain copies of mm4 and mm0,
 * so A3 equals A0 and A2 equals A1; B0..B3 are the full sums built from
 * src1 and src5.  Results are stored with movd at dst, 16+dst, ...,
 * 112+dst; rounder is emitted in front of the mm4 / mm0 operand text and
 * shift is the psrad count.
 * NOTE(review): presumably selected when the data behind src4 is known to
 * be all zero -- confirm against Z_COND_IDCT, which is defined elsewhere.
 */
941 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
942 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
943 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
944 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 945 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
946 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
947 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
948 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
949 #rounder ", %%mm4 \n\t"\ | |
950 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
951 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
952 #rounder ", %%mm0 \n\t"\ | |
953 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
954 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
955 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
956 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
957 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
958 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
959 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
960 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
961 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
962 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
963 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
964 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
965 "psrad $" #shift ", %%mm7 \n\t"\ | |
966 "psrad $" #shift ", %%mm4 \n\t"\ | |
967 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
968 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
969 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
970 "psrad $" #shift ", %%mm0 \n\t"\ | |
971 "psrad $" #shift ", %%mm2 \n\t"\ | |
972 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
973 "movd %%mm7, " #dst " \n\t"\ | |
974 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
975 "movd %%mm0, 16+" #dst " \n\t"\ | |
976 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
977 "movd %%mm2, 96+" #dst " \n\t"\ | |
978 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
979 "movd %%mm4, 112+" #dst " \n\t"\ | |
980 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
981 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
982 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
983 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
984 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
985 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
986 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
987 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
988 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
989 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
990 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
991 "psrad $" #shift ", %%mm2 \n\t"\ | |
992 "psrad $" #shift ", %%mm5 \n\t"\ | |
993 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
994 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
995 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
996 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
997 "psrad $" #shift ", %%mm6 \n\t"\ | |
998 "psrad $" #shift ", %%mm4 \n\t"\ | |
999 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
1000 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1001 "movd %%mm2, 32+" #dst " \n\t"\ | |
1002 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1003 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1004 "movd %%mm6, 48+" #dst " \n\t"\ | |
1005 "movd %%mm4, 64+" #dst " \n\t"\ | |
1006 "movd %%mm5, 80+" #dst " \n\t" | |
1007 | |
1008 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
1009 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1010 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1011 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1012 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
1013 "jmp 9f \n\t" | |
1014 | |
1015 "#.balign 16 \n\t"\ | |
1016 "3: \n\t" | |
1017 #undef IDCT | |
/*
 * IDCT variant that loads only src0 and src1.
 * With no src4 contribution, mm6 and mm5 are plain copies of mm4 and mm0,
 * so A3 equals A0 and A2 equals A1; B0..B3 are single products of src1
 * with the coefficient quads at 48, 64, 80 and 96 of operand %2.
 * Results are stored with movd at dst, 16+dst, ..., 112+dst; rounder is
 * emitted in front of the mm4 / mm0 operand text and shift is the psrad
 * count.
 * NOTE(review): presumably selected when the data behind src4 and src5 is
 * known to be all zero -- confirm against Z_COND_IDCT, defined elsewhere.
 */
1018 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1019 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1020 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
1021 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1022 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1023 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1024 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1025 #rounder ", %%mm4 \n\t"\ | |
1026 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1027 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1028 #rounder ", %%mm0 \n\t"\ | |
1029 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1030 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1031 "movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 */\ | |
1032 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1033 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1034 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1035 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1036 "psrad $" #shift ", %%mm7 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1037 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1038 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
1039 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1040 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1041 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 1042 "psrad $" #shift ", %%mm1 \n\t"\ |
1043 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1044 "movd %%mm7, " #dst " \n\t"\ | |
1045 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1046 "movd %%mm0, 16+" #dst " \n\t"\ | |
1047 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1048 "movd %%mm1, 96+" #dst " \n\t"\ | |
1049 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1050 "movd %%mm4, 112+" #dst " \n\t"\ | |
1051 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1052 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1053 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1054 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ | |
1055 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
1056 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1057 "psrad $" #shift ", %%mm1 \n\t"\ | |
1058 "psrad $" #shift ", %%mm5 \n\t"\ | |
1059 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1060 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1061 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1062 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1063 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1064 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1065 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1066 "movd %%mm1, 32+" #dst " \n\t"\ | |
1067 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1068 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1069 "movd %%mm6, 48+" #dst " \n\t"\ | |
1070 "movd %%mm4, 64+" #dst " \n\t"\ | |
1071 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1072 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1073 |
209 | 1074 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1075 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1076 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1077 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1078 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1079 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1080 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1081 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1082 "5: \n\t" |
209 | 1083 #undef IDCT |
/*
 * IDCT variant that loads only src0 and src4, and handles two adjacent
 * 8-byte groups per expansion (src0/src4 and 8+src0/8+src4).
 * No B terms are computed, so each result is just A0..A3 and the mirrored
 * output slots receive identical data:
 *   A0 -> dst and 112+dst    A1 -> 16+dst and 96+dst
 *   A2 -> 32+dst and 80+dst  A3 -> 48+dst and 64+dst
 * Stores use movq (four packed words); the first group's results land in
 * the low half of each stored quadword, the second group's in the high
 * half.  rounder is emitted in front of the mm4 / mm0 / mm1 / mm2 operand
 * text and shift is the psrad count.
 * NOTE(review): presumably selected when the data behind src1 and src5 is
 * known to be all zero -- confirm against Z_COND_IDCT, defined elsewhere.
 */
1084 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1085 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1086 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1087 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1088 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1089 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1090 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1091 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1092 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1093 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1094 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1095 #rounder ", %%mm4 \n\t"\ | |
1096 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1097 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 1098 #rounder ", %%mm0 \n\t"\ |
1099 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1100 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1101 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1102 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1103 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1104 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ | |
1105 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1106 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1107 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1108 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1109 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1110 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1111 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1112 #rounder ", %%mm1 \n\t"\ | |
1113 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ | |
1114 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ | |
1115 #rounder ", %%mm2 \n\t"\ | |
1116 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ | |
1117 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ | |
1118 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ | |
1119 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1120 "psrad $" #shift ", %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1121 "psrad $" #shift ", %%mm7 \n\t"\ |
209 | 1122 "psrad $" #shift ", %%mm3 \n\t"\ |
1123 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ | |
1124 "movq %%mm4, " #dst " \n\t"\ | |
1125 "psrad $" #shift ", %%mm0 \n\t"\ | |
1126 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ | |
1127 "movq %%mm0, 16+" #dst " \n\t"\ | |
1128 "movq %%mm0, 96+" #dst " \n\t"\ | |
1129 "movq %%mm4, 112+" #dst " \n\t"\ | |
1130 "psrad $" #shift ", %%mm5 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1131 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1132 "psrad $" #shift ", %%mm2 \n\t"\ |
209 | 1133 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\ |
1134 "movq %%mm5, 32+" #dst " \n\t"\ | |
1135 "psrad $" #shift ", %%mm1 \n\t"\ | |
1136 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\ | |
1137 "movq %%mm6, 48+" #dst " \n\t"\ | |
1138 "movq %%mm6, 64+" #dst " \n\t"\ | |
1139 "movq %%mm5, 80+" #dst " \n\t" | |
1140 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1141 |
209 | 1142 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1143 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1144 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1145 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1146 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1147 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1148 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1149 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1150 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1151 "1: \n\t" |
209 | 1152 #undef IDCT |
/*
 * IDCT variant that never loads src5.
 * A0..A3 are built from src0 and src4; B0..B3 are single products of src1
 * with the coefficient quads at 48, 64, 80 and 96 of operand %2 (the
 * products involving R5/R7 are left out).
 * Results are stored with movd at dst, 16+dst, ..., 112+dst; rounder is
 * emitted in front of the mm4 / mm0 operand text and shift is the psrad
 * count.
 * NOTE(review): presumably selected when the data behind src5 is known to
 * be all zero -- confirm against Z_COND_IDCT, which is defined elsewhere.
 */
1153 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1154 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1155 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1156 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
209 | 1157 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1158 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1159 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1160 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1161 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1162 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1163 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1164 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1165 #rounder ", %%mm4 \n\t"\ | |
1166 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1167 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1168 #rounder ", %%mm0 \n\t"\ | |
1169 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1170 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 1171 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1172 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1173 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1174 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1175 "movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3 */\ | |
1176 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1177 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1178 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 1179 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1180 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1181 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1182 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
1183 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1184 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1185 "psrad $" #shift ", %%mm0 \n\t"\ | |
1186 "psrad $" #shift ", %%mm3 \n\t"\ | |
1187 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1188 "movd %%mm7, " #dst " \n\t"\ | |
1189 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1190 "movd %%mm0, 16+" #dst " \n\t"\ | |
1191 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1192 "movd %%mm3, 96+" #dst " \n\t"\ | |
1193 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1194 "movd %%mm4, 112+" #dst " \n\t"\ | |
1195 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1196 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1197 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1198 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ | |
1199 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1200 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1201 "psrad $" #shift ", %%mm3 \n\t"\ | |
1202 "psrad $" #shift ", %%mm5 \n\t"\ | |
1203 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1204 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1205 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1206 "psrad $" #shift ", %%mm6 \n\t"\ |
209 | 1207 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1208 "movd %%mm3, 32+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1209 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1210 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1211 "movd %%mm6, 48+" #dst " \n\t"\ | |
1212 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1213 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1214 "movd %%mm4, 64+" #dst " \n\t"\ | |
1215 "movd %%mm5, 80+" #dst " \n\t" | |
1216 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1217 |
209 | 1218 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1219 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1220 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1221 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1222 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1223 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1224 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1225 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1226 "#.balign 16 \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1227 "7: \n\t" |
209 | 1228 #undef IDCT |
1229 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1230 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1231 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1232 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1233 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1234 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1235 #rounder ", %%mm4 \n\t"\ | |
1236 #rounder ", %%mm0 \n\t"\ | |
1237 "psrad $" #shift ", %%mm4 \n\t"\ | |
1238 "psrad $" #shift ", %%mm0 \n\t"\ | |
1239 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1240 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1241 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1242 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1243 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1244 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1245 #rounder ", %%mm1 \n\t"\ | |
1246 #rounder ", %%mm2 \n\t"\ | |
1247 "psrad $" #shift ", %%mm1 \n\t"\ | |
1248 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ | |
1249 "movq %%mm4, " #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1250 "psrad $" #shift ", %%mm2 \n\t"\ |
209 | 1251 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
1252 "movq %%mm0, 16+" #dst " \n\t"\ | |
1253 "movq %%mm0, 96+" #dst " \n\t"\ | |
1254 "movq %%mm4, 112+" #dst " \n\t"\ | |
1255 "movq %%mm0, 32+" #dst " \n\t"\ | |
1256 "movq %%mm4, 48+" #dst " \n\t"\ | |
1257 "movq %%mm4, 64+" #dst " \n\t"\ | |
1258 "movq %%mm0, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1259 |
209 | 1260 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1261 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1262 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1263 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1264 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1265 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1266 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1267 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1268 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1269 /* |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1270 Input |
209 | 1271 00 40 04 44 20 60 24 64 |
1272 10 30 14 34 50 70 54 74 | |
1273 01 41 03 43 21 61 23 63 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1274 11 31 13 33 51 71 53 73 |
209 | 1275 02 42 06 46 22 62 26 66 |
1276 12 32 16 36 52 72 56 76 | |
1277 05 45 07 47 25 65 27 67 | |
1278 15 35 17 37 55 75 57 77 | |
1279 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1280 Temp |
209 | 1281 00 04 10 14 20 24 30 34 |
1282 40 44 50 54 60 64 70 74 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1283 01 03 11 13 21 23 31 33 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1284 41 43 51 53 61 63 71 73 |
209 | 1285 02 06 12 16 22 26 32 36 |
1286 42 46 52 56 62 66 72 76 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1287 05 07 15 17 25 27 35 37 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1288 45 47 55 57 65 67 75 77 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1289 */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1290 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1291 "9: \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1292 :: "r" (block), "r" (temp), "r" (coeffs) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1293 : "%eax" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1294 ); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1295 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1296 |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
/**
 * Public entry point that runs this file's MMX idct() routine on a
 * coefficient block.
 *
 * @param block coefficient buffer handed straight to idct(); the result is
 *              presumably left in the same buffer (NOTE(review): confirm
 *              against the idct() body, which writes through this pointer
 *              in its second pass).
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1301 |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1302 //FIXME merge add/put into the idct |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1303 |
1064 | 1304 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1305 { |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1306 idct(block); |
854
3034f1816596
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1307 put_pixels_clamped_mmx(block, dest, line_size); |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1308 } |
1064 | 1309 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1310 { |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1311 idct(block); |
854
3034f1816596
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
706
diff
changeset
|
1312 add_pixels_clamped_mmx(block, dest, line_size); |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
429
diff
changeset
|
1313 } |