Mercurial > libavcodec.hg
annotate i386/simple_idct_mmx.c @ 350:6ebbecc10063 libavcodec
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
- Bug fix H.263+ AIC tables.
- Warning fixes.
author | pulento |
---|---|
date | Thu, 02 May 2002 04:39:45 +0000 |
parents | e80ad397d30e |
children | 6cef8253faab |
rev | line source |
---|---|
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1 /* |
209 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
3 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
7 (at your option) any later version. |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
8 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
12 GNU General Public License for more details. |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
13 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
17 */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
18 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
19 #include <inttypes.h> |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
20 #include "../dsputil.h" |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
21 #include "../mangle.h" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
22 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/*
 * Fixed-point cosine factors: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5.
 * They stay macros (rather than an enum) because code further down in this
 * file #undefs C0..C7.
 */
#define C0 23170
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520

/* Right-shift amounts applied to the results of the row and column passes. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6

/*
 * 64-bit constants, 8-byte aligned.
 * NOTE(review): nothing in the visible C code reads these two; presumably
 * they are referenced by symbol name from the inline asm later in the file
 * (mangle.h is included) -- confirm before removing or renaming them.
 */
static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;

/*
 * 8-byte aligned scratch buffer of 64 16-bit values.
 * NOTE(review): file-scope mutable state; if idct() uses it as its
 * intermediate buffer the transform is not reentrant -- confirm.
 */
static int16_t __attribute__((aligned(8))) temp[64];

/*
 * Constant table for the MMX code: 14 groups of four 16-bit words (8 bytes
 * each). The inline asm in idct() appears to address the table by byte offset
 * from one of its operands (e.g. 16(%2) is commented as the "C4 C4 C4 C4"
 * group, 80(%2) as "-C1 C5 -C1 C5"), so the order and size of the groups must
 * not change. The table is only read in the visible code, hence const.
 */
static const int16_t __attribute__((aligned(8))) coeffs[]= {
    /* byte 0: row rounder, 1<<(ROW_SHIFT-1) in the low word of each 32-bit lane */
    1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
    /*
     * byte 8: same rounder, plus a 1 in the high word of the first lane.
     * That 1 is worth 1<<16 == ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT when the
     * lane is read as a little-endian 32-bit value.
     */
    1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,

    /* byte 16 */
     C4,  C4,  C4,  C4,
     C4, -C4,  C4, -C4,

    /* byte 32 */
     C2,  C6,  C2,  C6,
     C6, -C2,  C6, -C2,

    /* byte 48 */
     C1,  C3,  C1,  C3,
     C5,  C7,  C5,  C7,

    /* byte 64 */
     C3, -C7,  C3, -C7,
    -C1, -C5, -C1, -C5,

    /* byte 80 */
     C5, -C1,  C5, -C1,
     C7,  C3,  C7,  C3,

    /* byte 96 */
     C7, -C5,  C7, -C5,
     C3, -C1,  C3, -C1
};
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
65 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
213
diff
changeset
|
66 #if 0 |
209 | 67 static void unused_var_killer(){ |
68 int a= wm1010 + d40000; | |
69 temp[0]=a; | |
70 } | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
71 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/* Fallback so this reference routine also compiles on its own; the file
 * normally defines COL_SHIFT next to the cosine constants. */
#ifndef COL_SHIFT
#define COL_SHIFT 20
#endif

/**
 * Plain C column pass of the 8-point inverse DCT.
 *
 * Reads eight coefficients from input (indices 0, 1, 16, 17, 32, 33, 48, 49,
 * i.e. a permuted layout) and writes the eight results to col[8*k], k = 0..7.
 * All inputs are read before any output is stored.
 *
 * @param col    output column; only every 8th element is written
 * @param input  permuted coefficient block, read-only
 */
static inline void idctCol(int16_t *col, const int16_t *input)
{
/* Kept from the original: the file-level macros are dropped here so the
 * names can be reused as local constants (the #undefs stay in effect for
 * the rest of the translation unit). */
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
    const int C1 = 22725;
    const int C2 = 21407;
    const int C3 = 19266;
    const int C4 = 16384;
    const int C5 = 12873;
    const int C6 = 8867;
    const int C7 = 4520;

    /* Gather the coefficients in natural order x0..x7. */
    const int x0 = input[8*0 + 0];
    const int x1 = input[8*2 + 0];
    const int x2 = input[8*0 + 1];
    const int x3 = input[8*2 + 1];
    const int x4 = input[8*4 + 0];
    const int x5 = input[8*6 + 0];
    const int x6 = input[8*4 + 1];
    const int x7 = input[8*6 + 1];

    /* Adding half of the divisor makes the final shift round to nearest. */
    const int rounder = 1 << (COL_SHIFT - 1);

    /* Even part. */
    const int a0 = C4*x0 + C2*x2 + C4*x4 + C6*x6 + rounder;
    const int a1 = C4*x0 + C6*x2 - C4*x4 - C2*x6 + rounder;
    const int a2 = C4*x0 - C6*x2 - C4*x4 + C2*x6 + rounder;
    const int a3 = C4*x0 - C2*x2 + C4*x4 - C6*x6 + rounder;

    /* Odd part. */
    const int b0 = C1*x1 + C3*x3 + C5*x5 + C7*x7;
    const int b1 = C3*x1 - C7*x3 - C1*x5 - C5*x7;
    const int b2 = C5*x1 - C1*x3 + C7*x5 + C3*x7;
    const int b3 = C7*x1 - C5*x3 + C3*x5 - C1*x7;

    /* Butterfly: outputs k and 7-k are the sum and difference of a_k, b_k. */
    col[8*0] = (int16_t)((a0 + b0) >> COL_SHIFT);
    col[8*1] = (int16_t)((a1 + b1) >> COL_SHIFT);
    col[8*2] = (int16_t)((a2 + b2) >> COL_SHIFT);
    col[8*3] = (int16_t)((a3 + b3) >> COL_SHIFT);
    col[8*4] = (int16_t)((a3 - b3) >> COL_SHIFT);
    col[8*5] = (int16_t)((a2 - b2) >> COL_SHIFT);
    col[8*6] = (int16_t)((a1 - b1) >> COL_SHIFT);
    col[8*7] = (int16_t)((a0 - b0) >> COL_SHIFT);
}
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
126 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/* Fallback so this reference routine also compiles on its own; the file
 * normally defines ROW_SHIFT next to the cosine constants. */
#ifndef ROW_SHIFT
#define ROW_SHIFT 11
#endif

/**
 * Plain C row pass of the 8-point inverse DCT.
 *
 * Reads eight coefficients from input (indices 0, 1, 4, 5, 8, 9, 12, 13,
 * i.e. a permuted layout) and writes the eight results to output[2*k],
 * k = 0..7. Odd-indexed output elements are left untouched.
 *
 * @param output  destination; only even indices 0..14 are written
 * @param input   permuted coefficient row, read-only
 */
static inline void idctRow(int16_t *output, const int16_t *input)
{
    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5; named W* so the locals cannot
     * collide with the file-level C0..C7 macros. */
    const int W1 = 22725;
    const int W2 = 21407;
    const int W3 = 19266;
    const int W4 = 16384;
    const int W5 = 12873;
    const int W6 = 8867;
    const int W7 = 4520;

    /* Gather the coefficients in natural order x0..x7. */
    const int x0 = input[0];
    const int x2 = input[1];
    const int x4 = input[4];
    const int x6 = input[5];
    const int x1 = input[8];
    const int x3 = input[9];
    const int x5 = input[12];
    const int x7 = input[13];

    int16_t res[8];
    int k;

    if ((x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) {
        /* DC-only row: every output is 8*x0. A multiply is used instead of
         * x0<<3 because left-shifting a negative value is undefined in C. */
        const int16_t dc = (int16_t)(x0 * 8);

        for (k = 0; k < 8; k++)
            res[k] = dc;
    } else {
        /* Adding half of the divisor makes the final shift round to nearest. */
        const int rounder = 1 << (ROW_SHIFT - 1);

        /* Even part. */
        const int a0 = W4*x0 + W2*x2 + W4*x4 + W6*x6 + rounder;
        const int a1 = W4*x0 + W6*x2 - W4*x4 - W2*x6 + rounder;
        const int a2 = W4*x0 - W6*x2 - W4*x4 + W2*x6 + rounder;
        const int a3 = W4*x0 - W2*x2 + W4*x4 - W6*x6 + rounder;

        /* Odd part. */
        const int b0 = W1*x1 + W3*x3 + W5*x5 + W7*x7;
        const int b1 = W3*x1 - W7*x3 - W1*x5 - W5*x7;
        const int b2 = W5*x1 - W1*x3 + W7*x5 + W3*x7;
        const int b3 = W7*x1 - W5*x3 + W3*x5 - W1*x7;

        /* Butterfly: outputs k and 7-k are the sum and difference of a_k, b_k. */
        res[0] = (int16_t)((a0 + b0) >> ROW_SHIFT);
        res[1] = (int16_t)((a1 + b1) >> ROW_SHIFT);
        res[2] = (int16_t)((a2 + b2) >> ROW_SHIFT);
        res[3] = (int16_t)((a3 + b3) >> ROW_SHIFT);
        res[4] = (int16_t)((a3 - b3) >> ROW_SHIFT);
        res[5] = (int16_t)((a2 - b2) >> ROW_SHIFT);
        res[6] = (int16_t)((a1 - b1) >> ROW_SHIFT);
        res[7] = (int16_t)((a0 - b0) >> ROW_SHIFT);
    }

    /* Results go to every second slot of the destination. */
    for (k = 0; k < 8; k++)
        output[2*k] = res[k];
}
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
192 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
193 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
194 static inline void idct(int16_t *block) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
195 { |
209 | 196 asm volatile( |
197 #if 0 //Alternative, simpler variant | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
198 |
209 | 199 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
200 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
201 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
202 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
203 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 204 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
205 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
206 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
207 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
208 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
209 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
210 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
211 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
212 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
213 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
214 #rounder ", %%mm4 \n\t"\ | |
215 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
216 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 217 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
218 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
219 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
220 #rounder ", %%mm0 \n\t"\ | |
221 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
222 "paddd %%mm0, %%mm0 \n\t" \ | |
223 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
224 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
225 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
226 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
227 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
228 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
229 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
230 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
231 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
232 "psrad $" #shift ", %%mm7 \n\t"\ | |
233 "psrad $" #shift ", %%mm4 \n\t"\ | |
234 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
235 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
236 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
237 "psrad $" #shift ", %%mm1 \n\t"\ | |
238 "psrad $" #shift ", %%mm2 \n\t"\ | |
239 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
240 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
241 "movq %%mm7, " #dst " \n\t"\ | |
242 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
243 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
244 "movq %%mm2, 24+" #dst " \n\t"\ | |
245 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
246 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
247 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
248 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
249 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
250 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
251 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
252 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
253 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
254 "psrad $" #shift ", %%mm2 \n\t"\ | |
255 "psrad $" #shift ", %%mm0 \n\t"\ | |
256 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
257 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
258 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
259 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
260 "psrad $" #shift ", %%mm6 \n\t"\ | |
261 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
262 "movq %%mm2, 8+" #dst " \n\t"\ | |
263 "psrad $" #shift ", %%mm4 \n\t"\ | |
264 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
265 "movq %%mm4, 16+" #dst " \n\t"\ | |
266 | |
267 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
268 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
269 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
270 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
271 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
272 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
273 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
274 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
275 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
276 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
277 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
278 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
279 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
280 #rounder ", %%mm4 \n\t"\ |
209 | 281 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
282 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
283 #rounder ", %%mm0 \n\t"\ | |
284 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
285 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
286 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
287 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
288 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
289 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
290 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
291 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
292 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
293 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
294 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
295 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
296 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
297 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 298 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
299 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
300 "psrad $" #shift ", %%mm7 \n\t"\ | |
301 "psrad $" #shift ", %%mm4 \n\t"\ | |
302 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
303 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
304 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
305 "psrad $" #shift ", %%mm0 \n\t"\ | |
306 "psrad $" #shift ", %%mm2 \n\t"\ | |
307 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
308 "movd %%mm7, " #dst " \n\t"\ | |
309 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
310 "movd %%mm0, 16+" #dst " \n\t"\ | |
311 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
312 "movd %%mm2, 96+" #dst " \n\t"\ | |
313 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
314 "movd %%mm4, 112+" #dst " \n\t"\ | |
315 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
316 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
317 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
318 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
319 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
320 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
321 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
322 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
323 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
324 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
325 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
326 "psrad $" #shift ", %%mm2 \n\t"\ | |
327 "psrad $" #shift ", %%mm5 \n\t"\ | |
328 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
329 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
330 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
331 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
332 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
333 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 334 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
335 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
336 "movd %%mm2, 32+" #dst " \n\t"\ | |
337 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
338 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
339 "movd %%mm6, 48+" #dst " \n\t"\ | |
340 "movd %%mm4, 64+" #dst " \n\t"\ | |
341 "movd %%mm5, 80+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
342 |
209 | 343 |
344 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
345 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
346 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
347 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
348 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
349 "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
350 "pand %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
351 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
352 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
353 "por %%mm3, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
354 "packssdw %%mm4,%%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
355 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
356 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
357 "jz 1f \n\t"\ |
209 | 358 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
359 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
360 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
361 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
362 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
363 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
364 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
365 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
366 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
367 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
368 #rounder ", %%mm4 \n\t"\ |
209 | 369 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
370 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
371 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
372 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
373 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
374 #rounder ", %%mm0 \n\t"\ | |
375 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
376 "paddd %%mm0, %%mm0 \n\t" \ | |
377 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
378 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
379 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
380 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
381 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
382 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
383 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
384 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
385 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
386 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
387 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 388 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
389 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
390 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
391 "psrad $" #shift ", %%mm1 \n\t"\ | |
392 "psrad $" #shift ", %%mm2 \n\t"\ | |
393 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
394 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
395 "movq %%mm7, " #dst " \n\t"\ | |
396 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
397 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
398 "movq %%mm2, 24+" #dst " \n\t"\ | |
399 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
400 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
401 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
402 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
209 | 403 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
404 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
405 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
406 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
407 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
408 "psrad $" #shift ", %%mm2 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
409 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 410 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
411 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
412 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
413 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
414 "psrad $" #shift ", %%mm6 \n\t"\ | |
415 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
416 "movq %%mm2, 8+" #dst " \n\t"\ | |
417 "psrad $" #shift ", %%mm4 \n\t"\ | |
418 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
419 "movq %%mm4, 16+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
420 "jmp 2f \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
421 "1: \n\t"\ |
209 | 422 "pslld $16, %%mm0 \n\t"\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
423 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
209 | 424 "psrad $13, %%mm0 \n\t"\ |
425 "packssdw %%mm0, %%mm0 \n\t"\ | |
426 "movq %%mm0, " #dst " \n\t"\ | |
427 "movq %%mm0, 8+" #dst " \n\t"\ | |
428 "movq %%mm0, 16+" #dst " \n\t"\ | |
429 "movq %%mm0, 24+" #dst " \n\t"\ | |
430 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
431 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
432 |
209 | 433 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
434 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
435 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) | |
436 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) | |
437 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
438 |
209 | 439 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) |
440 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) | |
441 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
442 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
443 |
209 | 444 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
445 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
446 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
447 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
448 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
449 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
450 #else |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
451 |
209 | 452 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
453 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
454 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
455 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
456 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
457 "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
458 "pand %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
459 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
460 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
461 "por %%mm3, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
462 "packssdw %%mm4,%%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
463 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
464 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
465 "jz 1f \n\t"\ |
209 | 466 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
467 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
468 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
469 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
470 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
471 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
472 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
473 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
474 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
475 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
476 #rounder ", %%mm4 \n\t"\ |
209 | 477 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
478 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
479 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
480 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
481 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
482 #rounder ", %%mm0 \n\t"\ | |
483 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
484 "paddd %%mm0, %%mm0 \n\t" \ | |
485 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
486 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
487 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
488 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
489 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
490 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
491 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
492 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
493 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
494 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
495 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 496 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
497 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
498 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
499 "psrad $" #shift ", %%mm1 \n\t"\ | |
500 "psrad $" #shift ", %%mm2 \n\t"\ | |
501 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
502 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
503 "movq %%mm7, " #dst " \n\t"\ | |
504 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
505 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
506 "movq %%mm2, 24+" #dst " \n\t"\ | |
507 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
508 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
509 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
510 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
209 | 511 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
512 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
513 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
514 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
515 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
516 "psrad $" #shift ", %%mm2 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
517 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 518 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
519 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
520 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
521 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
522 "psrad $" #shift ", %%mm6 \n\t"\ | |
523 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
524 "movq %%mm2, 8+" #dst " \n\t"\ | |
525 "psrad $" #shift ", %%mm4 \n\t"\ | |
526 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
527 "movq %%mm4, 16+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
528 "jmp 2f \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
529 "1: \n\t"\ |
209 | 530 "pslld $16, %%mm0 \n\t"\ |
213
e80ad397d30e
Cygwin's mangling by Felix Buenemann <atmosfear@users.sourceforge.net>
nickols_k
parents:
209
diff
changeset
|
531 "paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
209 | 532 "psrad $13, %%mm0 \n\t"\ |
533 "packssdw %%mm0, %%mm0 \n\t"\ | |
534 "movq %%mm0, " #dst " \n\t"\ | |
535 "movq %%mm0, 8+" #dst " \n\t"\ | |
536 "movq %%mm0, 16+" #dst " \n\t"\ | |
537 "movq %%mm0, 24+" #dst " \n\t"\ | |
538 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
539 |
209 | 540 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
541 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
542 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
543 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
544 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
545 "movq %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
546 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
547 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
548 "por %%mm3, %%mm4 \n\t"\ |
209 | 549 "packssdw %%mm4,%%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
550 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
551 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
552 "jz " #bt " \n\t"\ |
209 | 553 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
554 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
555 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
556 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
557 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
558 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
559 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
560 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
561 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
562 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
563 #rounder ", %%mm4 \n\t"\ | |
564 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
565 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
566 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
567 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
568 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
569 #rounder ", %%mm0 \n\t"\ | |
570 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
571 "paddd %%mm0, %%mm0 \n\t" \ | |
572 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
573 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
574 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
575 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
576 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
577 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
578 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
579 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
580 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
581 "psrad $" #shift ", %%mm7 \n\t"\ | |
582 "psrad $" #shift ", %%mm4 \n\t"\ | |
583 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
584 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
585 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
586 "psrad $" #shift ", %%mm1 \n\t"\ | |
587 "psrad $" #shift ", %%mm2 \n\t"\ | |
588 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
589 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
590 "movq %%mm7, " #dst " \n\t"\ | |
591 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
592 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
593 "movq %%mm2, 24+" #dst " \n\t"\ | |
594 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
595 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
596 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
597 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
598 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
599 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
600 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
601 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
602 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
603 "psrad $" #shift ", %%mm2 \n\t"\ | |
604 "psrad $" #shift ", %%mm0 \n\t"\ | |
605 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
606 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
607 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
608 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
609 "psrad $" #shift ", %%mm6 \n\t"\ | |
610 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
611 "movq %%mm2, 8+" #dst " \n\t"\ | |
612 "psrad $" #shift ", %%mm4 \n\t"\ | |
613 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
614 "movq %%mm4, 16+" #dst " \n\t"\ | |
615 | |
616 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
617 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
618 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
619 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
620 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
621 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
622 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
623 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
624 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
625 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
626 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
627 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
628 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
629 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
630 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
631 #rounder ", %%mm4 \n\t"\ | |
632 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
633 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 634 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
635 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
636 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
637 #rounder ", %%mm0 \n\t"\ | |
638 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
639 "paddd %%mm0, %%mm0 \n\t" \ | |
640 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
641 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
642 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
643 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
644 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
645 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
646 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
647 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
648 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
649 "psrad $" #shift ", %%mm7 \n\t"\ | |
650 "psrad $" #shift ", %%mm4 \n\t"\ | |
651 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
652 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
653 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
654 "psrad $" #shift ", %%mm1 \n\t"\ | |
655 "psrad $" #shift ", %%mm2 \n\t"\ | |
656 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
657 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
658 "movq %%mm7, " #dst " \n\t"\ | |
659 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
660 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
661 "movq %%mm2, 24+" #dst " \n\t"\ | |
662 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
663 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
664 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
665 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
666 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
667 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
668 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
669 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
670 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
671 "psrad $" #shift ", %%mm2 \n\t"\ | |
672 "psrad $" #shift ", %%mm0 \n\t"\ | |
673 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
674 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
675 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
676 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
677 "psrad $" #shift ", %%mm6 \n\t"\ | |
678 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
679 "movq %%mm2, 8+" #dst " \n\t"\ | |
680 "psrad $" #shift ", %%mm4 \n\t"\ | |
681 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
682 "movq %%mm4, 16+" #dst " \n\t"\ | |
683 | |
684 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
685 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
686 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) | |
687 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) | |
688 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) | |
689 | |
/*
 * IDCT (variant reading all four source operands: src0, src4, src1, src5).
 *
 * src0/src4 feed the A0..A3 terms and src1/src5 feed the B0..B3 terms named
 * in the inline annotations; every factor is a pmaddwd against the word
 * table addressed through operand %2 (offsets 16..104).
 * Each A+B and A-B result is shifted right arithmetically by `shift`, packed
 * to signed-saturated words, and its low 32 bits are stored with movd:
 *   dst    : A0+B0     16+dst : A1+B1     32+dst : A2+B2     48+dst : A3+B3
 *   64+dst : A3-B3     80+dst : A2-B2     96+dst : A1-B1    112+dst : A0-B0
 * `rounder` is stringified and pasted in front of ", %%mm4" and ", %%mm0".
 * NOTE(review): the invocations in this file pass "/nop", which presumably
 * turns those two lines into assembler comments (no rounding) - confirm for
 * the assembler in use.
 * Writes mm0-mm7.
 */
690 #undef IDCT | |
691 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
692 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
693 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
694 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
695 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
696 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
697 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
698 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
699 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
700 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
701 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
702 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
703 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
704 #rounder ", %%mm4 \n\t"\ |
209 | 705 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
706 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
707 #rounder ", %%mm0 \n\t"\ | |
708 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
709 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
710 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
711 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
712 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
713 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
714 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
715 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
716 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
717 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
718 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
719 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
720 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
721 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 722 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
723 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
724 "psrad $" #shift ", %%mm7 \n\t"\ | |
725 "psrad $" #shift ", %%mm4 \n\t"\ | |
726 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
727 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
728 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
729 "psrad $" #shift ", %%mm0 \n\t"\ | |
730 "psrad $" #shift ", %%mm2 \n\t"\ | |
731 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
732 "movd %%mm7, " #dst " \n\t"\ | |
733 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
734 "movd %%mm0, 16+" #dst " \n\t"\ | |
735 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
736 "movd %%mm2, 96+" #dst " \n\t"\ | |
737 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
738 "movd %%mm4, 112+" #dst " \n\t"\ | |
739 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
740 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
741 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
742 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
743 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
744 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
745 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
746 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
747 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
748 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
749 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
750 "psrad $" #shift ", %%mm2 \n\t"\ | |
751 "psrad $" #shift ", %%mm5 \n\t"\ | |
752 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
753 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
754 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
755 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
756 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
757 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 758 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
759 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
760 "movd %%mm2, 32+" #dst " \n\t"\ | |
761 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
762 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
763 "movd %%mm6, 48+" #dst " \n\t"\ | |
764 "movd %%mm4, 64+" #dst " \n\t"\ | |
765 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
766 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
767 |
209 | 768 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
769 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
770 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
771 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
772 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
773 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
774 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
775 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
776 "4: \n\t" |
209 | 777 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
778 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
779 |
/*
 * IDCT (variant that never reads src1; only src0, src4 and src5 are loaded).
 *
 * The A0..A3 terms come from src0/src4 exactly as annotated inline; the
 * B0..B3 terms are single pmaddwd products of src5 with the table words at
 * 56(%2), 72(%2), 88(%2) and 104(%2).
 * Each A+B and A-B result is shifted right arithmetically by `shift`, packed
 * to signed-saturated words, and its low 32 bits are stored with movd:
 *   dst    : A0+B0     16+dst : A1+B1     32+dst : A2+B2     48+dst : A3+B3
 *   64+dst : A3-B3     80+dst : A2-B2     96+dst : A1-B1    112+dst : A0-B0
 * `rounder` is stringified and pasted in front of ", %%mm4" and ", %%mm0".
 * NOTE(review): presumably this variant serves the path where the data
 * behind src1 is known to be zero; the test that selects it is not part of
 * this macro - confirm against the Z_COND_IDCT definition.
 * Writes mm0-mm7.
 */
209 | 780 #undef IDCT |
781 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
782 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
783 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
784 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 785 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
786 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
787 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
788 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
789 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
790 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
791 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
792 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
793 #rounder ", %%mm4 \n\t"\ | |
794 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
795 #rounder ", %%mm0 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
796 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 797 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
798 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
799 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
800 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
801 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
802 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
803 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
804 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
209 | 805 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
806 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
807 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
808 "psrad $" #shift ", %%mm1 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
809 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 810 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
811 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
812 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
813 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 814 "psrad $" #shift ", %%mm2 \n\t"\ |
815 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
816 "movd %%mm1, " #dst " \n\t"\ | |
817 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
818 "movd %%mm0, 16+" #dst " \n\t"\ | |
819 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
820 "movd %%mm2, 96+" #dst " \n\t"\ | |
821 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
822 "movd %%mm4, 112+" #dst " \n\t"\ | |
823 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
824 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
825 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
826 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
827 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
828 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
829 "psrad $" #shift ", %%mm2 \n\t"\ | |
830 "psrad $" #shift ", %%mm5 \n\t"\ | |
831 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
832 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
833 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ | |
834 "psrad $" #shift ", %%mm6 \n\t"\ | |
835 "psrad $" #shift ", %%mm1 \n\t"\ | |
836 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
837 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
838 "movd %%mm2, 32+" #dst " \n\t"\ | |
839 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
840 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
841 "movd %%mm6, 48+" #dst " \n\t"\ | |
842 "movd %%mm1, 64+" #dst " \n\t"\ | |
843 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
844 |
209 | 845 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
846 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
847 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
848 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
849 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
850 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
851 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
852 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
853 "6: \n\t" |
209 | 854 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
855 |
/*
 * IDCT (variant that loads only src0 and src5; src4 and src1 are unused).
 *
 * With no src4 contribution the two C4 products of src0 are used directly:
 * mm4 is copied to mm6 (serving as both A0 and A3) and mm0 is copied to mm5
 * (serving as both A1 and A2). The B0..B3 terms are single pmaddwd products
 * of src5 with the table words at 56(%2), 72(%2), 88(%2) and 104(%2).
 * Each A+B and A-B result is shifted right arithmetically by `shift`, packed
 * to signed-saturated words, and its low 32 bits are stored with movd:
 *   dst    : A0+B0     16+dst : A1+B1     32+dst : A2+B2     48+dst : A3+B3
 *   64+dst : A3-B3     80+dst : A2-B2     96+dst : A1-B1    112+dst : A0-B0
 * `rounder` is stringified and pasted in front of ", %%mm4" and ", %%mm0".
 * NOTE(review): presumably selected when the data behind src4 and src1 is
 * known to be zero - confirm against the Z_COND_IDCT definition.
 * Writes mm0-mm7.
 */
209 | 856 #undef IDCT |
857 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
858 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
859 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
860 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
861 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
862 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
863 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
864 #rounder ", %%mm4 \n\t"\ | |
865 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
866 #rounder ", %%mm0 \n\t"\ | |
867 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
868 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
869 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
870 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
871 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
872 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
873 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
874 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
875 "psrad $" #shift ", %%mm1 \n\t"\ | |
876 "psrad $" #shift ", %%mm4 \n\t"\ | |
877 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
878 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
879 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
880 "psrad $" #shift ", %%mm0 \n\t"\ | |
881 "psrad $" #shift ", %%mm2 \n\t"\ | |
882 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
883 "movd %%mm1, " #dst " \n\t"\ | |
884 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
885 "movd %%mm0, 16+" #dst " \n\t"\ | |
886 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
887 "movd %%mm2, 96+" #dst " \n\t"\ | |
888 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
889 "movd %%mm4, 112+" #dst " \n\t"\ | |
890 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
891 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
892 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
893 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
894 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
895 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
896 "psrad $" #shift ", %%mm2 \n\t"\ | |
897 "psrad $" #shift ", %%mm5 \n\t"\ | |
898 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
899 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
900 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ | |
901 "psrad $" #shift ", %%mm6 \n\t"\ | |
902 "psrad $" #shift ", %%mm1 \n\t"\ | |
903 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
904 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
905 "movd %%mm2, 32+" #dst " \n\t"\ | |
906 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
907 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
908 "movd %%mm6, 48+" #dst " \n\t"\ | |
909 "movd %%mm1, 64+" #dst " \n\t"\ | |
910 "movd %%mm5, 80+" #dst " \n\t" | |
911 | |
912 | |
913 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
914 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
915 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
916 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
917 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
918 "jmp 9f \n\t" | |
919 | |
920 "#.balign 16 \n\t"\ | |
921 "2: \n\t" | |
922 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) | |
923 | |
/*
 * IDCT (variant that never reads src4; src0, src1 and src5 are loaded).
 *
 * With no src4 contribution the two C4 products of src0 are used directly:
 * mm4 is copied to mm6 (serving as both A0 and A3) and mm0 is copied to mm5
 * (serving as both A1 and A2). The B0..B3 terms combine pmaddwd products of
 * src1 and src5 with the table words at 48(%2)..104(%2).
 * Each A+B and A-B result is shifted right arithmetically by `shift`, packed
 * to signed-saturated words, and its low 32 bits are stored with movd:
 *   dst    : A0+B0     16+dst : A1+B1     32+dst : A2+B2     48+dst : A3+B3
 *   64+dst : A3-B3     80+dst : A2-B2     96+dst : A1-B1    112+dst : A0-B0
 * `rounder` is stringified and pasted in front of ", %%mm4" and ", %%mm0".
 * NOTE(review): presumably selected when the data behind src4 is known to be
 * zero - confirm against the Z_COND_IDCT definition.
 * Writes mm0-mm7.
 */
924 #undef IDCT | |
925 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
926 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
927 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
928 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 929 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
930 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
931 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
932 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
933 #rounder ", %%mm4 \n\t"\ | |
934 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
935 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
936 #rounder ", %%mm0 \n\t"\ | |
937 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
938 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
939 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
940 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
941 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
942 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
943 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
944 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
945 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
946 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
947 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
948 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
949 "psrad $" #shift ", %%mm7 \n\t"\ | |
950 "psrad $" #shift ", %%mm4 \n\t"\ | |
951 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
952 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
953 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
954 "psrad $" #shift ", %%mm0 \n\t"\ | |
955 "psrad $" #shift ", %%mm2 \n\t"\ | |
956 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
957 "movd %%mm7, " #dst " \n\t"\ | |
958 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
959 "movd %%mm0, 16+" #dst " \n\t"\ | |
960 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
961 "movd %%mm2, 96+" #dst " \n\t"\ | |
962 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
963 "movd %%mm4, 112+" #dst " \n\t"\ | |
964 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
965 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
966 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
967 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
968 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
969 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
970 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
971 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
972 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
973 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
974 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
975 "psrad $" #shift ", %%mm2 \n\t"\ | |
976 "psrad $" #shift ", %%mm5 \n\t"\ | |
977 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
978 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
979 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
980 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
981 "psrad $" #shift ", %%mm6 \n\t"\ | |
982 "psrad $" #shift ", %%mm4 \n\t"\ | |
983 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
984 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
985 "movd %%mm2, 32+" #dst " \n\t"\ | |
986 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
987 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
988 "movd %%mm6, 48+" #dst " \n\t"\ | |
989 "movd %%mm4, 64+" #dst " \n\t"\ | |
990 "movd %%mm5, 80+" #dst " \n\t" | |
991 | |
992 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
993 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
994 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
995 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
996 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
997 "jmp 9f \n\t" | |
998 | |
999 "#.balign 16 \n\t"\ | |
1000 "3: \n\t" | |
/*
 * IDCT (variant that loads only src0 and src1; src4 and src5 are unused).
 *
 * With no src4 contribution the two C4 products of src0 are used directly:
 * mm4 is copied to mm6 (serving as both A0 and A3) and mm0 is copied to mm5
 * (serving as both A1 and A2). The B0..B3 terms are single pmaddwd products
 * of src1 with the table words at 48(%2), 64(%2), 80(%2) and 96(%2).
 * Each A+B and A-B result is shifted right arithmetically by `shift`, packed
 * to signed-saturated words, and its low 32 bits are stored with movd:
 *   dst    : A0+B0     16+dst : A1+B1     32+dst : A2+B2     48+dst : A3+B3
 *   64+dst : A3-B3     80+dst : A2-B2     96+dst : A1-B1    112+dst : A0-B0
 * `rounder` is stringified and pasted in front of ", %%mm4" and ", %%mm0".
 * NOTE(review): presumably selected when the data behind src4 and src5 is
 * known to be zero - confirm against the Z_COND_IDCT definition.
 * Writes mm0-mm7.
 */
1001 #undef IDCT | |
1002 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1003 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1004 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
1005 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1006 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1007 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1008 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1009 #rounder ", %%mm4 \n\t"\ | |
1010 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1011 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1012 #rounder ", %%mm0 \n\t"\ | |
1013 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1014 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1015 "movq 64(%2), %%mm3 \n\t"\ | |
1016 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1017 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1018 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1019 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1020 "psrad $" #shift ", %%mm7 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1021 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1022 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
1023 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1024 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1025 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 1026 "psrad $" #shift ", %%mm1 \n\t"\ |
1027 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1028 "movd %%mm7, " #dst " \n\t"\ | |
1029 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1030 "movd %%mm0, 16+" #dst " \n\t"\ | |
1031 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1032 "movd %%mm1, 96+" #dst " \n\t"\ | |
1033 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1034 "movd %%mm4, 112+" #dst " \n\t"\ | |
1035 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1036 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1037 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1038 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ | |
1039 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
1040 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
1041 "psrad $" #shift ", %%mm1 \n\t"\ | |
1042 "psrad $" #shift ", %%mm5 \n\t"\ | |
1043 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1044 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1045 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1046 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1047 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1048 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1049 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1050 "movd %%mm1, 32+" #dst " \n\t"\ | |
1051 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1052 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1053 "movd %%mm6, 48+" #dst " \n\t"\ | |
1054 "movd %%mm4, 64+" #dst " \n\t"\ | |
1055 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1056 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1057 |
209 | 1058 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1059 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1060 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1061 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1062 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1063 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1064 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1065 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1066 "5: \n\t" |
/*
 * IDCT (variant that loads only src0 and src4, two 8-byte groups per call).
 *
 * No B terms are computed: src1 and src5 are never read. The macro forms
 * A0..A3 from src0/src4 (into mm4, mm0, mm5, mm6) and again from
 * 8+src0 / 8+src4 (into mm7, mm3, mm2, mm1). After the arithmetic right
 * shift by `shift`, packssdw merges each pair into four words and movq
 * stores 8 bytes, every value being written to two mirrored offsets:
 *   A0 -> dst and 112+dst        A1 -> 16+dst and 96+dst
 *   A2 -> 32+dst and 80+dst      A3 -> 48+dst and 64+dst
 * Because one expansion covers src0 and 8+src0, the invocation list that
 * follows in this file keeps only every second IDCT() call active.
 * `rounder` is stringified and pasted in front of ", %%mm4", ", %%mm0",
 * ", %%mm1" and ", %%mm2".
 * NOTE(review): presumably selected when the data behind src1 and src5 is
 * known to be zero - confirm against the Z_COND_IDCT definition.
 * Writes mm0-mm7.
 */
209 | 1067 #undef IDCT |
1068 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1069 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1070 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1071 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1072 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1073 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1074 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1075 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1076 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1077 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1078 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1079 #rounder ", %%mm4 \n\t"\ | |
1080 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1081 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 1082 #rounder ", %%mm0 \n\t"\ |
1083 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1084 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1085 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1086 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1087 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1088 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ | |
1089 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1090 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1091 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1092 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1093 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1094 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1095 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1096 #rounder ", %%mm1 \n\t"\ | |
1097 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ | |
1098 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ | |
1099 #rounder ", %%mm2 \n\t"\ | |
1100 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ | |
1101 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ | |
1102 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ | |
1103 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1104 "psrad $" #shift ", %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1105 "psrad $" #shift ", %%mm7 \n\t"\ |
209 | 1106 "psrad $" #shift ", %%mm3 \n\t"\ |
1107 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ | |
1108 "movq %%mm4, " #dst " \n\t"\ | |
1109 "psrad $" #shift ", %%mm0 \n\t"\ | |
1110 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ | |
1111 "movq %%mm0, 16+" #dst " \n\t"\ | |
1112 "movq %%mm0, 96+" #dst " \n\t"\ | |
1113 "movq %%mm4, 112+" #dst " \n\t"\ | |
1114 "psrad $" #shift ", %%mm5 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1115 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1116 "psrad $" #shift ", %%mm2 \n\t"\ |
209 | 1117 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1118 "movq %%mm5, 32+" #dst " \n\t"\ | |
1119 "psrad $" #shift ", %%mm1 \n\t"\ | |
1120 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1121 "movq %%mm6, 48+" #dst " \n\t"\ | |
1122 "movq %%mm6, 64+" #dst " \n\t"\ | |
1123 "movq %%mm5, 80+" #dst " \n\t" | |
1124 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1125 |
209 | 1126 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1127 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1128 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1129 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1130 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1131 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1132 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1133 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1134 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1135 "1: \n\t" |
209 | 1136 #undef IDCT |
1137 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1138 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1139 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1140 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
209 | 1141 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1142 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1143 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1144 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1145 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1146 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1147 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1148 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1149 #rounder ", %%mm4 \n\t"\ | |
1150 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1151 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1152 #rounder ", %%mm0 \n\t"\ | |
1153 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1154 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 1155 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1156 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1157 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1158 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1159 "movq 64(%2), %%mm1 \n\t"\ | |
1160 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1161 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1162 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 1163 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1164 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1165 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1166 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
1167 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1168 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1169 "psrad $" #shift ", %%mm0 \n\t"\ | |
1170 "psrad $" #shift ", %%mm3 \n\t"\ | |
1171 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1172 "movd %%mm7, " #dst " \n\t"\ | |
1173 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1174 "movd %%mm0, 16+" #dst " \n\t"\ | |
1175 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1176 "movd %%mm3, 96+" #dst " \n\t"\ | |
1177 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1178 "movd %%mm4, 112+" #dst " \n\t"\ | |
1179 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1180 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1181 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1182 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ | |
1183 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1184 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1185 "psrad $" #shift ", %%mm3 \n\t"\ | |
1186 "psrad $" #shift ", %%mm5 \n\t"\ | |
1187 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1188 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1189 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1190 "psrad $" #shift ", %%mm6 \n\t"\ |
209 | 1191 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1192 "movd %%mm3, 32+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1193 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1194 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1195 "movd %%mm6, 48+" #dst " \n\t"\ | |
1196 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1197 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1198 "movd %%mm4, 64+" #dst " \n\t"\ | |
1199 "movd %%mm5, 80+" #dst " \n\t" | |
1200 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1201 |
209 | 1202 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1203 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1204 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1205 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1206 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1207 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1208 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1209 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1210 "#.balign 16 \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1211 "7: \n\t" |
209 | 1212 #undef IDCT |
1213 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1214 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1215 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1216 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1217 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1218 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1219 #rounder ", %%mm4 \n\t"\ | |
1220 #rounder ", %%mm0 \n\t"\ | |
1221 "psrad $" #shift ", %%mm4 \n\t"\ | |
1222 "psrad $" #shift ", %%mm0 \n\t"\ | |
1223 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1224 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1225 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1226 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1227 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1228 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1229 #rounder ", %%mm1 \n\t"\ | |
1230 #rounder ", %%mm2 \n\t"\ | |
1231 "psrad $" #shift ", %%mm1 \n\t"\ | |
1232 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ | |
1233 "movq %%mm4, " #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1234 "psrad $" #shift ", %%mm2 \n\t"\ |
209 | 1235 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
1236 "movq %%mm0, 16+" #dst " \n\t"\ | |
1237 "movq %%mm0, 96+" #dst " \n\t"\ | |
1238 "movq %%mm4, 112+" #dst " \n\t"\ | |
1239 "movq %%mm0, 32+" #dst " \n\t"\ | |
1240 "movq %%mm4, 48+" #dst " \n\t"\ | |
1241 "movq %%mm4, 64+" #dst " \n\t"\ | |
1242 "movq %%mm0, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1243 |
209 | 1244 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1245 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1246 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1247 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1248 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1249 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1250 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1251 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1252 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1253 /* |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1254 Input |
209 | 1255 00 40 04 44 20 60 24 64 |
1256 10 30 14 34 50 70 54 74 | |
1257 01 41 03 43 21 61 23 63 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1258 11 31 13 33 51 71 53 73 |
209 | 1259 02 42 06 46 22 62 26 66 |
1260 12 32 16 36 52 72 56 76 | |
1261 05 45 07 47 25 65 27 67 | |
1262 15 35 17 37 55 75 57 77 | |
1263 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1264 Temp |
209 | 1265 00 04 10 14 20 24 30 34 |
1266 40 44 50 54 60 64 70 74 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1267 01 03 11 13 21 23 31 33 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1268 41 43 51 53 61 63 71 73 |
209 | 1269 02 06 12 16 22 26 32 36 |
1270 42 46 52 56 62 66 72 76 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1271 05 07 15 17 25 27 35 37 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1272 45 47 55 57 65 67 75 77 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1273 */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1274 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1275 "9: \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1276 :: "r" (block), "r" (temp), "r" (coeffs) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1277 : "%eax" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1278 ); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1279 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1280 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/*
 simple_idct_mmx: non-static wrapper that hands `block` straight to idct()
 and returns nothing.

 block - pointer to the int16_t coefficients passed on to idct(); this
         wrapper itself never reads or writes through the pointer.

 NOTE(review): idct() is presumably the MMX routine whose inline assembly
 ends just above, operating on an 8x8 block in place - confirm against its
 definition.
*/
1281 void simple_idct_mmx(int16_t *block) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1282 { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1283 idct(block); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1284 } |