Mercurial > libavcodec.hg
annotate i386/simple_idct_mmx.c @ 209:c0d8ecae7ac5 libavcodec
(commit by michael)
faster simple idct in MMX
author | arpi_esp |
---|---|
date | Thu, 17 Jan 2002 20:00:41 +0000 |
parents | bd77d3cbb233 |
children | e80ad397d30e |
rev | line source |
---|---|
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1 /* |
209 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
3 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
7 (at your option) any later version. |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
8 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
12 GNU General Public License for more details. |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
13 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
17 */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
18 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
19 #include <inttypes.h> |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
20 #include "../dsputil.h" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
21 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/*
 * Fixed-point cosine constants:
 *   Ci = (int)(cos(i*M_PI/16) * sqrt(2) * (1<<14) + 0.5)
 * i.e. the coefficients carry 14 fractional bits.
 */
#define C0 23170 /* i = 0 */
#define C1 22725 /* i = 1 */
#define C2 21407 /* i = 2 */
#define C3 19266 /* i = 3 */
#define C4 16384 /* i = 4, exactly 1<<14 */
#define C5 12873 /* i = 5 */
#define C6 8867  /* i = 6 */
#define C7 4520  /* i = 7 */

/* Right shift applied to the 32-bit sums at the end of the row pass. */
#define ROW_SHIFT 11
/* Right shift applied to the 32-bit sums at the end of the column pass. */
#define COL_SHIFT 20 // 6
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
33 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/*
 * 8-byte aligned file-scope data.
 * NOTE(review): these are presumably consumed by the MMX inline asm further
 * down in the file; that use is not visible from here - confirm.
 */

/* 64-bit mask with the upper 16 bits of each 32-bit half set. */
static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
/* 64-bit value 0x40000 (1<<18) in the low 32-bit half, zero in the high half. */
static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
/* Scratch buffer of 64 16-bit values (the size of one 8x8 block). */
static int16_t __attribute__((aligned(8))) temp[64];
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
37 static int16_t __attribute__((aligned(8))) coeffs[]= { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
38 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
39 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
40 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
41 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
42 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
43 // 0, 0, 0, 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
44 // 0, 0, 0, 0, |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
45 |
209 | 46 C4, C4, C4, C4, |
47 C4, -C4, C4, -C4, | |
48 | |
49 C2, C6, C2, C6, | |
50 C6, -C2, C6, -C2, | |
51 | |
52 C1, C3, C1, C3, | |
53 C5, C7, C5, C7, | |
54 | |
55 C3, -C7, C3, -C7, | |
56 -C1, -C5, -C1, -C5, | |
57 | |
58 C5, -C1, C5, -C1, | |
59 C7, C3, C7, C3, | |
60 | |
61 C7, -C5, C7, -C5, | |
62 C3, -C1, C3, -C1 | |
63 }; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
64 |
209 | 65 static void unused_var_killer(){ |
66 int a= wm1010 + d40000; | |
67 temp[0]=a; | |
68 } | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
69 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
70 #if 0 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
71 static void inline idctCol (int16_t * col, int16_t *input) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
72 { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
73 #undef C0 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
74 #undef C1 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
75 #undef C2 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
76 #undef C3 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
77 #undef C4 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
78 #undef C5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
79 #undef C6 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
80 #undef C7 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
81 int a0, a1, a2, a3, b0, b1, b2, b3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
82 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
83 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
84 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
85 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
86 const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
87 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
88 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
89 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
90 /* |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
91 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
92 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
93 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
94 return; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
95 }*/ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
96 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
97 col[8*0] = input[8*0 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
98 col[8*1] = input[8*2 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
99 col[8*2] = input[8*0 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
100 col[8*3] = input[8*2 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
101 col[8*4] = input[8*4 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
102 col[8*5] = input[8*6 + 0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
103 col[8*6] = input[8*4 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
104 col[8*7] = input[8*6 + 1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
105 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
106 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
107 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
108 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
109 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
110 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
111 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
112 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
113 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
114 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
115 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
116 col[8*0] = (a0 + b0) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
117 col[8*1] = (a1 + b1) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
118 col[8*2] = (a2 + b2) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
119 col[8*3] = (a3 + b3) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
120 col[8*4] = (a3 - b3) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
121 col[8*5] = (a2 - b2) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
122 col[8*6] = (a1 - b1) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
123 col[8*7] = (a0 - b0) >> COL_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
124 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
125 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
126 static void inline idctRow (int16_t * output, int16_t * input) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
127 { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
128 int16_t row[8]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
129 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
130 int a0, a1, a2, a3, b0, b1, b2, b3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
131 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
132 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
133 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
134 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
135 const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
136 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
137 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
138 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
139 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
140 row[0] = input[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
141 row[2] = input[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
142 row[4] = input[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
143 row[6] = input[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
144 row[1] = input[8]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
145 row[3] = input[9]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
146 row[5] = input[12]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
147 row[7] = input[13]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
148 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
149 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
150 row[0] = row[1] = row[2] = row[3] = row[4] = |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
151 row[5] = row[6] = row[7] = row[0]<<3; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
152 output[0] = row[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
153 output[2] = row[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
154 output[4] = row[2]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
155 output[6] = row[3]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
156 output[8] = row[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
157 output[10] = row[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
158 output[12] = row[6]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
159 output[14] = row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
160 return; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
161 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
162 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
163 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
164 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
165 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
166 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
167 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
168 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
169 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
170 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
171 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
172 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
173 row[0] = (a0 + b0) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
174 row[1] = (a1 + b1) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
175 row[2] = (a2 + b2) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
176 row[3] = (a3 + b3) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
177 row[4] = (a3 - b3) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
178 row[5] = (a2 - b2) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
179 row[6] = (a1 - b1) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
180 row[7] = (a0 - b0) >> ROW_SHIFT; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
181 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
182 output[0] = row[0]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
183 output[2] = row[1]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
184 output[4] = row[2]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
185 output[6] = row[3]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
186 output[8] = row[4]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
187 output[10] = row[5]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
188 output[12] = row[6]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
189 output[14] = row[7]; |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
190 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
191 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
192 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
193 static inline void idct(int16_t *block) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
194 { |
209 | 195 asm volatile( |
196 #if 0 //Alternative, simpler variant | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
197 |
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Emits the asm text for one unconditional pass over four input quadwords.
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3; the inline
 *                           notes label their contents R4 R0 r4 r0,
 *                           R6 R2 r6 r2, R3 R1 r3 r1 and R7 R5 r7 r5
 *                           (lower case = low dword lane, upper case = high
 *                           dword lane of each pmaddwd result).
 *   dst                     memory operand; result quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst.
 *   rounder                 instruction plus source operand that is pasted in
 *                           front of ", %%mm4" and ", %%mm0"
 *                           (the call site in this branch passes "paddd 8(%2)").
 *   shift                   immediate count for the psrad applied to every sum.
 *
 * Constants are read from byte offsets 16..104 of asm operand %2.  The even
 * sums A0..A3 and odd sums B0..B3 are combined as An+Bn and An-Bn, shifted,
 * packed with packssdw and stored.  A2 is formed as 2*mm0 - A1.
 * All of mm0..mm7 are overwritten.
 */
209 | 198 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
199 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
200 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
201 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
202 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 203 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
204 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
205 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
206 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
207 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
208 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
209 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
210 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
211 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
212 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
213 #rounder ", %%mm4 \n\t"\ | |
214 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
215 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 216 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
217 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
218 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
219 #rounder ", %%mm0 \n\t"\ | |
220 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
221 "paddd %%mm0, %%mm0 \n\t" \ | |
222 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
223 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
224 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
225 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
226 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
227 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
228 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
229 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
230 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
231 "psrad $" #shift ", %%mm7 \n\t"\ | |
232 "psrad $" #shift ", %%mm4 \n\t"\ | |
233 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
234 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
235 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
236 "psrad $" #shift ", %%mm1 \n\t"\ | |
237 "psrad $" #shift ", %%mm2 \n\t"\ | |
238 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
239 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
240 "movq %%mm7, " #dst " \n\t"\ | |
241 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
242 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
243 "movq %%mm2, 24+" #dst " \n\t"\ | |
244 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
245 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
246 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
247 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
248 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
249 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
250 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
251 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
252 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
253 "psrad $" #shift ", %%mm2 \n\t"\ | |
254 "psrad $" #shift ", %%mm0 \n\t"\ | |
255 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
256 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
257 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
258 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
259 "psrad $" #shift ", %%mm6 \n\t"\ | |
260 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
261 "movq %%mm2, 8+" #dst " \n\t"\ | |
262 "psrad $" #shift ", %%mm4 \n\t"\ | |
263 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
264 "movq %%mm4, 16+" #dst " \n\t"\ | |
265 | |
/*
 * COL_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Emits the asm text for one unconditional pass whose results are written
 * as eight separate 32-bit stores instead of four quadwords.
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3 (labelled
 *                           R4 R0 r4 r0, R6 R2 r6 r2, R3 R1 r3 r1 and
 *                           R7 R5 r7 r5 by the inline notes).
 *   dst                     memory operand; movd stores go to dst + 0, 16,
 *                           32, 48, 64, 80, 96 and 112, holding An+Bn for
 *                           n = 0..3 followed by An-Bn for n = 3..0.
 *   rounder                 instruction plus source operand pasted in front
 *                           of ", %%mm4" and ", %%mm0".
 *   shift                   immediate count for the psrad applied to every sum.
 *
 * Constants are read from byte offsets 16..104 of asm operand %2.  Each sum
 * is packed against itself with packssdw before its low 32 bits are stored.
 * Unlike ROW_IDCT, A2 is kept in mm5 via a register copy.
 * All of mm0..mm7 are overwritten.
 */
266 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
267 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
268 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
269 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
270 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
271 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
272 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
273 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
274 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
275 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
276 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
277 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
278 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
279 #rounder ", %%mm4 \n\t"\ |
209 | 280 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
281 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
282 #rounder ", %%mm0 \n\t"\ | |
283 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
284 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
285 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
286 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
287 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
288 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
289 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
290 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
291 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
292 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
293 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
294 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
295 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
296 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 297 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
298 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
299 "psrad $" #shift ", %%mm7 \n\t"\ | |
300 "psrad $" #shift ", %%mm4 \n\t"\ | |
301 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
302 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
303 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
304 "psrad $" #shift ", %%mm0 \n\t"\ | |
305 "psrad $" #shift ", %%mm2 \n\t"\ | |
306 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
307 "movd %%mm7, " #dst " \n\t"\ | |
308 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
309 "movd %%mm0, 16+" #dst " \n\t"\ | |
310 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
311 "movd %%mm2, 96+" #dst " \n\t"\ | |
312 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
313 "movd %%mm4, 112+" #dst " \n\t"\ | |
314 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
315 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
316 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
317 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
318 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
319 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
320 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
321 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
322 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
323 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
324 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
325 "psrad $" #shift ", %%mm2 \n\t"\ | |
326 "psrad $" #shift ", %%mm5 \n\t"\ | |
327 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
328 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
329 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
330 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
331 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
332 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 333 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
334 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
335 "movd %%mm2, 32+" #dst " \n\t"\ | |
336 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
337 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
338 "movd %%mm6, 48+" #dst " \n\t"\ | |
339 "movd %%mm4, 64+" #dst " \n\t"\ | |
340 "movd %%mm5, 80+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
341 |
209 | 342 |
/*
 * DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Same arithmetic and quadword stores as ROW_IDCT (dst, 8+dst, 16+dst,
 * 24+dst), preceded by a shortcut test: mm0 is ANDed with the constant
 * wm1010, ORed with mm1..mm3, packed and moved to eax.  If the result is
 * zero, control branches to local label 1, where every dword of mm0 is
 * shifted left by 16 and arithmetically right by 13, packed, and the same
 * quadword is written to all four destination slots.  Otherwise the full
 * computation runs and then jumps over the shortcut to local label 2.
 *
 * NOTE(review): wm1010 is defined outside this macro; presumably it masks
 * off the r0/R0 words so only the remaining inputs are tested -- confirm.
 * NOTE(review): the "#paddd d40000" line starts with '#', which presumably
 * makes the assembler treat it as a comment, so no constant is added on the
 * shortcut path here (DC_COND_IDCT does add d40000) -- confirm.
 *
 * Overwrites eax and the MMX registers, and uses numeric local labels 1 and 2.
 */
343 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
344 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
345 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
346 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
347 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
348 "movq wm1010, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
349 "pand %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
350 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
351 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
352 "por %%mm3, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
353 "packssdw %%mm4,%%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
354 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
355 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
356 "jz 1f \n\t"\ |
209 | 357 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
358 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
359 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
360 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
361 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
362 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
363 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
364 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
365 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
366 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
367 #rounder ", %%mm4 \n\t"\ |
209 | 368 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
369 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
370 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
371 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
372 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
373 #rounder ", %%mm0 \n\t"\ | |
374 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
375 "paddd %%mm0, %%mm0 \n\t" \ | |
376 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
377 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
378 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
379 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
380 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
381 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
382 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
383 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
384 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
385 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
386 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 387 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
388 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
389 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
390 "psrad $" #shift ", %%mm1 \n\t"\ | |
391 "psrad $" #shift ", %%mm2 \n\t"\ | |
392 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
393 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
394 "movq %%mm7, " #dst " \n\t"\ | |
395 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
396 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
397 "movq %%mm2, 24+" #dst " \n\t"\ | |
398 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
399 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
400 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
401 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
209 | 402 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
403 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
404 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
405 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
406 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
407 "psrad $" #shift ", %%mm2 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
408 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 409 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
410 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
411 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
412 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
413 "psrad $" #shift ", %%mm6 \n\t"\ | |
414 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
415 "movq %%mm2, 8+" #dst " \n\t"\ | |
416 "psrad $" #shift ", %%mm4 \n\t"\ | |
417 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
418 "movq %%mm4, 16+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
419 "jmp 2f \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
420 "1: \n\t"\ |
209 | 421 "pslld $16, %%mm0 \n\t"\ |
422 "#paddd d40000, %%mm0 \n\t"\ | |
423 "psrad $13, %%mm0 \n\t"\ | |
424 "packssdw %%mm0, %%mm0 \n\t"\ | |
425 "movq %%mm0, " #dst " \n\t"\ | |
426 "movq %%mm0, 8+" #dst " \n\t"\ | |
427 "movq %%mm0, 16+" #dst " \n\t"\ | |
428 "movq %%mm0, 24+" #dst " \n\t"\ | |
429 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
430 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
431 |
209 | 432 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
433 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
434 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) | |
435 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) | |
436 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
437 |
209 | 438 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) |
439 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) | |
440 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
441 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
442 |
209 | 443 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
444 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
445 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
446 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
447 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
448 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
449 #else |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
450 |
/*
 * DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Emits the asm text for one pass over four input quadwords with a shortcut.
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3 (labelled
 *                           R4 R0 r4 r0, R6 R2 r6 r2, R3 R1 r3 r1 and
 *                           R7 R5 r7 r5 by the inline notes).
 *   dst                     memory operand; result quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst.
 *   rounder                 instruction plus source operand pasted in front
 *                           of ", %%mm4" and ", %%mm0".
 *   shift                   immediate count for the psrad applied to every sum.
 *
 * Shortcut test: mm0 is ANDed with the constant wm1010, ORed with mm1..mm3,
 * packed and moved to eax.  If the result is zero, control branches to local
 * label 1, where every dword of mm0 is shifted left by 16, d40000 is added,
 * the sum is shifted arithmetically right by 13 and packed, and the same
 * quadword is written to all four destination slots.  Otherwise the full
 * computation (constants at byte offsets 16..104 of asm operand %2, sums
 * combined as An+Bn and An-Bn, shifted, packed with packssdw) runs and then
 * jumps over the shortcut to local label 2.
 *
 * NOTE(review): wm1010 and d40000 are defined outside this macro; wm1010
 * presumably masks off the r0/R0 words so only the remaining inputs are
 * tested -- confirm.
 *
 * Overwrites eax and the MMX registers, and uses numeric local labels 1 and 2.
 */
209 | 451 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
452 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
453 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
454 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
455 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
456 "movq wm1010, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
457 "pand %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
458 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
459 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
460 "por %%mm3, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
461 "packssdw %%mm4,%%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
462 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
463 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
464 "jz 1f \n\t"\ |
209 | 465 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
466 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
467 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
468 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
469 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
470 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
471 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
472 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
473 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
474 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
475 #rounder ", %%mm4 \n\t"\ |
209 | 476 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
477 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
478 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
479 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
480 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
481 #rounder ", %%mm0 \n\t"\ | |
482 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
483 "paddd %%mm0, %%mm0 \n\t" \ | |
484 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
485 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
486 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
487 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
488 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
489 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
490 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
491 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
492 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
493 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
494 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 495 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
496 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
497 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
498 "psrad $" #shift ", %%mm1 \n\t"\ | |
499 "psrad $" #shift ", %%mm2 \n\t"\ | |
500 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
501 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
502 "movq %%mm7, " #dst " \n\t"\ | |
503 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
504 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
505 "movq %%mm2, 24+" #dst " \n\t"\ | |
506 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
507 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
508 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
509 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
209 | 510 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
511 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
512 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
513 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
514 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
515 "psrad $" #shift ", %%mm2 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
516 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 517 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
518 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
519 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
520 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
521 "psrad $" #shift ", %%mm6 \n\t"\ | |
522 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
523 "movq %%mm2, 8+" #dst " \n\t"\ | |
524 "psrad $" #shift ", %%mm4 \n\t"\ | |
525 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
526 "movq %%mm4, 16+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
527 "jmp 2f \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
528 "1: \n\t"\ |
209 | 529 "pslld $16, %%mm0 \n\t"\ |
530 "paddd d40000, %%mm0 \n\t"\ | |
531 "psrad $13, %%mm0 \n\t"\ | |
532 "packssdw %%mm0, %%mm0 \n\t"\ | |
533 "movq %%mm0, " #dst " \n\t"\ | |
534 "movq %%mm0, 8+" #dst " \n\t"\ | |
535 "movq %%mm0, 16+" #dst " \n\t"\ | |
536 "movq %%mm0, 24+" #dst " \n\t"\ | |
537 "2: \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
538 |
/*
 * Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt)
 *
 * Emits the asm text for one pass over four input quadwords that is skipped
 * entirely when all of them are zero.
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3 (labelled
 *                           R4 R0 r4 r0, R6 R2 r6 r2, R3 R1 r3 r1 and
 *                           R7 R5 r7 r5 by the inline notes).
 *   dst                     memory operand; result quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst.
 *   rounder                 instruction plus source operand pasted in front
 *                           of ", %%mm4" and ", %%mm0".
 *   shift                   immediate count for the psrad applied to every sum.
 *   bt                      branch target pasted after "jz" (the call sites
 *                           pass local-label references such as 4f, 2f, 1f).
 *
 * Zero test: mm0..mm3 are ORed together into mm4, packed and moved to eax;
 * if eax is zero, control jumps to bt and nothing is written to dst by this
 * macro.  Otherwise the computation matches ROW_IDCT: constants at byte
 * offsets 16..104 of asm operand %2, sums combined as An+Bn and An-Bn,
 * shifted, packed with packssdw and stored.
 *
 * Overwrites eax and the MMX registers.
 */
209 | 539 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
540 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
541 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
542 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
543 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
544 "movq %%mm0, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
545 "por %%mm1, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
546 "por %%mm2, %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
547 "por %%mm3, %%mm4 \n\t"\ |
209 | 548 "packssdw %%mm4,%%mm4 \n\t"\ |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
549 "movd %%mm4, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
550 "orl %%eax, %%eax \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
551 "jz " #bt " \n\t"\ |
209 | 552 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
553 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
554 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
555 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
556 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
557 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
558 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
559 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
560 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
561 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
562 #rounder ", %%mm4 \n\t"\ | |
563 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
564 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
565 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
566 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
567 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
568 #rounder ", %%mm0 \n\t"\ | |
569 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
570 "paddd %%mm0, %%mm0 \n\t" \ | |
571 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
572 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
573 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
574 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
575 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
576 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
577 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
578 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
579 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
580 "psrad $" #shift ", %%mm7 \n\t"\ | |
581 "psrad $" #shift ", %%mm4 \n\t"\ | |
582 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
583 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
584 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
585 "psrad $" #shift ", %%mm1 \n\t"\ | |
586 "psrad $" #shift ", %%mm2 \n\t"\ | |
587 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
588 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
589 "movq %%mm7, " #dst " \n\t"\ | |
590 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
591 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
592 "movq %%mm2, 24+" #dst " \n\t"\ | |
593 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
594 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
595 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
596 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
597 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
598 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
599 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
600 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
601 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
602 "psrad $" #shift ", %%mm2 \n\t"\ | |
603 "psrad $" #shift ", %%mm0 \n\t"\ | |
604 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
605 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
606 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
607 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
608 "psrad $" #shift ", %%mm6 \n\t"\ | |
609 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
610 "movq %%mm2, 8+" #dst " \n\t"\ | |
611 "psrad $" #shift ", %%mm4 \n\t"\ | |
612 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
613 "movq %%mm4, 16+" #dst " \n\t"\ | |
614 | |
/*
 * ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift)
 *
 * Emits the asm text for one unconditional pass over four input quadwords
 * (no zero or DC shortcut test).
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3 (labelled
 *                           R4 R0 r4 r0, R6 R2 r6 r2, R3 R1 r3 r1 and
 *                           R7 R5 r7 r5 by the inline notes).
 *   dst                     memory operand; result quadwords are stored at
 *                           dst, 8+dst, 16+dst and 24+dst.
 *   rounder                 instruction plus source operand pasted in front
 *                           of ", %%mm4" and ", %%mm0".
 *   shift                   immediate count for the psrad applied to every sum.
 *
 * Constants are read from byte offsets 16..104 of asm operand %2.  The even
 * sums A0..A3 and odd sums B0..B3 are combined as An+Bn and An-Bn, shifted,
 * packed with packssdw and stored.  A2 is formed as 2*mm0 - A1.
 * All of mm0..mm7 are overwritten.
 */
615 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
616 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
617 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
618 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
619 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
620 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
621 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
622 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
623 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
624 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
625 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
626 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
627 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
628 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
629 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
630 #rounder ", %%mm4 \n\t"\ | |
631 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
632 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 633 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
634 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
635 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
636 #rounder ", %%mm0 \n\t"\ | |
637 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
638 "paddd %%mm0, %%mm0 \n\t" \ | |
639 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
640 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
641 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
642 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
643 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
644 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
645 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
646 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
647 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
648 "psrad $" #shift ", %%mm7 \n\t"\ | |
649 "psrad $" #shift ", %%mm4 \n\t"\ | |
650 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
651 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
652 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
653 "psrad $" #shift ", %%mm1 \n\t"\ | |
654 "psrad $" #shift ", %%mm2 \n\t"\ | |
655 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
656 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
657 "movq %%mm7, " #dst " \n\t"\ | |
658 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
659 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
660 "movq %%mm2, 24+" #dst " \n\t"\ | |
661 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
662 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
663 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
664 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
665 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
666 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
667 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
668 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
669 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\ | |
670 "psrad $" #shift ", %%mm2 \n\t"\ | |
671 "psrad $" #shift ", %%mm0 \n\t"\ | |
672 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
673 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
674 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
675 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
676 "psrad $" #shift ", %%mm6 \n\t"\ | |
677 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
678 "movq %%mm2, 8+" #dst " \n\t"\ | |
679 "psrad $" #shift ", %%mm4 \n\t"\ | |
680 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
681 "movq %%mm4, 16+" #dst " \n\t"\ | |
682 | |
683 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
684 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
685 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) | |
686 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) | |
687 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) | |
688 | |
/* IDCT variant that uses all four source operands (src0, src4, src1, src5).
   Even-part sums A0..A3 are built from src0/src4, odd-part sums B0..B3 from
   src1/src5, using the coefficient words read from offsets 16..104 of %2.
   Each 32-bit A+B / A-B result is shifted right by `shift`, packed to 16 bits
   with signed saturation (packssdw) and stored with movd at dst+0, +16, +32,
   +48, +64, +80, +96 and +112.
   `rounder` is stringified and pasted in front of ", %%mm4" and ", %%mm0" to
   form one instruction each. */
689 #undef IDCT | |
690 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
691 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
692 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
693 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
694 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
695 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
696 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
697 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
698 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
699 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
700 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
701 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
702 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
703 #rounder ", %%mm4 \n\t"\ |
209 | 704 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
705 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
706 #rounder ", %%mm0 \n\t"\ | |
707 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
708 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
709 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
710 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
711 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
712 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
713 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
714 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
715 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
716 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
717 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
718 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
719 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
720 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 721 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
722 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
723 "psrad $" #shift ", %%mm7 \n\t"\ | |
724 "psrad $" #shift ", %%mm4 \n\t"\ | |
725 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
726 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
727 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
728 "psrad $" #shift ", %%mm0 \n\t"\ | |
729 "psrad $" #shift ", %%mm2 \n\t"\ | |
730 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
731 "movd %%mm7, " #dst " \n\t"\ | |
732 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
733 "movd %%mm0, 16+" #dst " \n\t"\ | |
734 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
735 "movd %%mm2, 96+" #dst " \n\t"\ | |
736 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
737 "movd %%mm4, 112+" #dst " \n\t"\ | |
738 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
739 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
740 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
741 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
742 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
743 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
744 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
745 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
746 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
747 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
748 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
749 "psrad $" #shift ", %%mm2 \n\t"\ | |
750 "psrad $" #shift ", %%mm5 \n\t"\ | |
751 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
752 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
753 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
754 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
755 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
756 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 757 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
758 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
759 "movd %%mm2, 32+" #dst " \n\t"\ | |
760 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
761 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
762 "movd %%mm6, 48+" #dst " \n\t"\ | |
763 "movd %%mm4, 64+" #dst " \n\t"\ | |
764 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
765 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
766 |
/* Second pass for the path where no Z_COND_IDCT label was taken: four
   invocations reading 8-byte groups from %1 (base offsets 0/64/32/96,
   advancing by 8 per call) and writing to %0 at byte offsets 0, 4, 8, 12,
   with shift 20. Afterwards jump to local label 9 (not in this excerpt).
   NOTE(review): the rounder argument is "/nop", so the generated rounder
   line starts with '/'; presumably the assembler treats it as a comment,
   i.e. no rounding instruction is emitted -- confirm for the GAS target. */
209 | 767 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
768 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
769 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
770 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
771 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
772 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
773 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/* Local label 4: the target named by the "4f" argument of the Z_COND_IDCT
   call on the 32(%0) group. The first pass continues here with the 64(%0)
   and 96(%0) groups, which name labels 6 and 5 respectively.
   The ".balign 16" directive is prefixed with '#' inside the asm string.
   NOTE(review): presumably that disables it as an assembler comment. */
774 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
775 "4: \n\t" |
209 | 776 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
777 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
778 |
/* IDCT variant for the path after local label 4.
   Loads src0, src4 and src5 only; src1 is accepted but never referenced, so
   every odd-part term that would come from src1 is left out (B0..B3 are the
   single pmaddwd products of src5).
   Each 32-bit result is shifted right by `shift`, packed to 16 bits with
   signed saturation (packssdw) and stored with movd at dst+0 ... dst+112 in
   steps of 16. `rounder` is pasted in front of ", %%mm4" and ", %%mm0". */
209 | 779 #undef IDCT
780 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
781 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
782 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
783 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 784 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
785 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
786 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
787 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
788 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
789 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
790 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
791 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
792 #rounder ", %%mm4 \n\t"\ | |
793 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
794 #rounder ", %%mm0 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
795 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 796 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
797 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
798 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
799 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
800 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
801 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
802 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
803 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
209 | 804 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
805 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
806 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
807 "psrad $" #shift ", %%mm1 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
808 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 809 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
810 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
811 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
812 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 813 "psrad $" #shift ", %%mm2 \n\t"\ |
814 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
815 "movd %%mm1, " #dst " \n\t"\ | |
816 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
817 "movd %%mm0, 16+" #dst " \n\t"\ | |
818 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
819 "movd %%mm2, 96+" #dst " \n\t"\ | |
820 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
821 "movd %%mm4, 112+" #dst " \n\t"\ | |
822 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
823 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
824 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
825 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
826 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
827 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
828 "psrad $" #shift ", %%mm2 \n\t"\ | |
829 "psrad $" #shift ", %%mm5 \n\t"\ | |
830 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
831 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
832 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
833 "psrad $" #shift ", %%mm6 \n\t"\ | |
834 "psrad $" #shift ", %%mm1 \n\t"\ | |
835 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
836 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
837 "movd %%mm2, 32+" #dst " \n\t"\ | |
838 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
839 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
840 "movd %%mm6, 48+" #dst " \n\t"\ | |
841 "movd %%mm1, 64+" #dst " \n\t"\ | |
842 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
843 |
/* Second pass for the label-4 path: four invocations reading 8-byte groups
   from %1 and writing to %0 at byte offsets 0, 4, 8, 12 with shift 20, then
   a jump to local label 9 (not in this excerpt). The src1 arguments
   (32/40/48/56(%1)) are passed but the macro in effect here never reads them.
   NOTE(review): rounder "/nop" presumably turns the rounder line into an
   assembler comment -- confirm for the GAS target. */
209 | 844 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
845 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
846 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
847 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
848 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
849 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
850 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/* Local label 6: the target named by the "6f" argument of the Z_COND_IDCT
   call on the 64(%0) group in the label-4 path. Only the 96(%0) group is
   still processed here; it names label 7. */
851 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
852 "6: \n\t" |
209 | 853 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
854 |
/* IDCT variant for the path after local label 6.
   Loads src0 and src5 only; src4 and src1 are accepted but never referenced.
   The two src0 products are used directly as the even part (mm4/mm6 and
   mm0/mm5 hold copies of the same values), and B0..B3 are the single pmaddwd
   products of src5.
   Each 32-bit result is shifted right by `shift`, packed to 16 bits with
   signed saturation (packssdw) and stored with movd at dst+0 ... dst+112 in
   steps of 16. `rounder` is pasted in front of ", %%mm4" and ", %%mm0". */
209 | 855 #undef IDCT
856 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
857 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
858 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
859 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
860 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
861 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
862 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
863 #rounder ", %%mm4 \n\t"\ | |
864 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
865 #rounder ", %%mm0 \n\t"\ | |
866 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
867 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
868 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
869 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
870 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
871 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
872 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
873 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
874 "psrad $" #shift ", %%mm1 \n\t"\ | |
875 "psrad $" #shift ", %%mm4 \n\t"\ | |
876 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
877 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
878 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
879 "psrad $" #shift ", %%mm0 \n\t"\ | |
880 "psrad $" #shift ", %%mm2 \n\t"\ | |
881 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
882 "movd %%mm1, " #dst " \n\t"\ | |
883 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
884 "movd %%mm0, 16+" #dst " \n\t"\ | |
885 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
886 "movd %%mm2, 96+" #dst " \n\t"\ | |
887 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
888 "movd %%mm4, 112+" #dst " \n\t"\ | |
889 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
890 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
891 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
892 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
893 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
894 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
895 "psrad $" #shift ", %%mm2 \n\t"\ | |
896 "psrad $" #shift ", %%mm5 \n\t"\ | |
897 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
898 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
899 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
900 "psrad $" #shift ", %%mm6 \n\t"\ | |
901 "psrad $" #shift ", %%mm1 \n\t"\ | |
902 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
903 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
904 "movd %%mm2, 32+" #dst " \n\t"\ | |
905 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
906 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
907 "movd %%mm6, 48+" #dst " \n\t"\ | |
908 "movd %%mm1, 64+" #dst " \n\t"\ | |
909 "movd %%mm5, 80+" #dst " \n\t" | |
910 | |
911 | |
/* Second pass for the label-6 path: four invocations reading from %1 and
   writing to %0 at byte offsets 0, 4, 8, 12 with shift 20, then a jump to
   local label 9 (not in this excerpt). The src4 and src1 arguments are
   passed but the macro in effect here never reads them.
   NOTE(review): rounder "/nop" presumably turns the rounder line into an
   assembler comment -- confirm for the GAS target. */
912 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
913 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
914 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
915 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
916 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
917 "jmp 9f \n\t" | |
918 | |
/* Local label 2: the target named by the "2f" argument of the Z_COND_IDCT
   call on the 64(%0) group in the first pass. Only the 96(%0) group is still
   processed here; it names label 3. */
919 "#.balign 16 \n\t"\ | |
920 "2: \n\t" | |
921 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) | |
922 | |
/* IDCT variant for the path after local label 2.
   Loads src0, src1 and src5 only; src4 is accepted but never referenced, so
   the two src0 products are used directly as the even part (mm4/mm6 and
   mm0/mm5 hold copies of the same values). B0..B3 combine src1 and src5.
   Each 32-bit result is shifted right by `shift`, packed to 16 bits with
   signed saturation (packssdw) and stored with movd at dst+0 ... dst+112 in
   steps of 16. `rounder` is pasted in front of ", %%mm4" and ", %%mm0". */
923 #undef IDCT | |
924 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
925 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
926 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
927 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
209 | 928 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
929 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
930 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
931 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
932 #rounder ", %%mm4 \n\t"\ | |
933 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
934 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
935 #rounder ", %%mm0 \n\t"\ | |
936 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
937 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
938 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
939 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
940 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
941 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
942 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
943 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
944 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
945 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
946 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
947 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
948 "psrad $" #shift ", %%mm7 \n\t"\ | |
949 "psrad $" #shift ", %%mm4 \n\t"\ | |
950 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
951 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
952 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
953 "psrad $" #shift ", %%mm0 \n\t"\ | |
954 "psrad $" #shift ", %%mm2 \n\t"\ | |
955 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
956 "movd %%mm7, " #dst " \n\t"\ | |
957 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
958 "movd %%mm0, 16+" #dst " \n\t"\ | |
959 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
960 "movd %%mm2, 96+" #dst " \n\t"\ | |
961 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
962 "movd %%mm4, 112+" #dst " \n\t"\ | |
963 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
964 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
965 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
966 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
967 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
968 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
969 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
970 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
971 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
972 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
973 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
974 "psrad $" #shift ", %%mm2 \n\t"\ | |
975 "psrad $" #shift ", %%mm5 \n\t"\ | |
976 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
977 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
978 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
979 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
980 "psrad $" #shift ", %%mm6 \n\t"\ | |
981 "psrad $" #shift ", %%mm4 \n\t"\ | |
982 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
983 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
984 "movd %%mm2, 32+" #dst " \n\t"\ | |
985 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
986 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
987 "movd %%mm6, 48+" #dst " \n\t"\ | |
988 "movd %%mm4, 64+" #dst " \n\t"\ | |
989 "movd %%mm5, 80+" #dst " \n\t" | |
990 | |
/* Second pass for the label-2 path: four invocations reading from %1 and
   writing to %0 at byte offsets 0, 4, 8, 12 with shift 20, then a jump to
   local label 9 (not in this excerpt). The src4 arguments are passed but the
   macro in effect here never reads them.
   NOTE(review): rounder "/nop" presumably turns the rounder line into an
   assembler comment -- confirm for the GAS target. */
991 //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
992 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
993 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
994 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
995 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
996 "jmp 9f \n\t" | |
997 | |
/* Local label 3: the target named by the "3f" argument of the Z_COND_IDCT
   call on the 96(%0) group in the label-2 path. */
998 "#.balign 16 \n\t"\ | |
999 "3: \n\t" | |
/* IDCT variant for the path after local label 3.
   Loads src0 and src1 only; src4 and src5 are accepted but never referenced.
   The two src0 products are used directly as the even part, and B0..B3 are
   the single pmaddwd products of src1 (coefficients at 48, 64, 80, 96 of %2).
   Each 32-bit result is shifted right by `shift`, packed to 16 bits with
   signed saturation (packssdw) and stored with movd at dst+0 ... dst+112 in
   steps of 16. `rounder` is pasted in front of ", %%mm4" and ", %%mm0". */
1000 #undef IDCT | |
1001 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1002 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1003 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
1004 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1005 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1006 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1007 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1008 #rounder ", %%mm4 \n\t"\ | |
1009 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1010 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1011 #rounder ", %%mm0 \n\t"\ | |
1012 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1013 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1014 "movq 64(%2), %%mm3 \n\t"\ | |
1015 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1016 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1017 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1018 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1019 "psrad $" #shift ", %%mm7 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1020 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1021 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
1022 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1023 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1024 "psrad $" #shift ", %%mm0 \n\t"\ |
209 | 1025 "psrad $" #shift ", %%mm1 \n\t"\ |
1026 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1027 "movd %%mm7, " #dst " \n\t"\ | |
1028 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1029 "movd %%mm0, 16+" #dst " \n\t"\ | |
1030 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1031 "movd %%mm1, 96+" #dst " \n\t"\ | |
1032 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1033 "movd %%mm4, 112+" #dst " \n\t"\ | |
1034 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1035 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1036 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1037 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ | |
1038 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
1039 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1040 "psrad $" #shift ", %%mm1 \n\t"\ | |
1041 "psrad $" #shift ", %%mm5 \n\t"\ | |
1042 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1043 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1044 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1045 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1046 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1047 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1048 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1049 "movd %%mm1, 32+" #dst " \n\t"\ | |
1050 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1051 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1052 "movd %%mm6, 48+" #dst " \n\t"\ | |
1053 "movd %%mm4, 64+" #dst " \n\t"\ | |
1054 "movd %%mm5, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1055 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1056 |
/* Second pass for the label-3 path: four invocations reading from %1 and
   writing to %0 at byte offsets 0, 4, 8, 12 with shift 20, then a jump to
   local label 9 (not in this excerpt). The src4 and src5 arguments are
   passed but the macro in effect here never reads them.
   NOTE(review): rounder "/nop" presumably turns the rounder line into an
   assembler comment -- confirm for the GAS target. */
209 | 1057 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1058 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1059 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1060 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1061 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1062 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1063 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/* Local label 5: the target named by the "5f" argument of the Z_COND_IDCT
   call on the 96(%0) group in the label-4 path. */
1064 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1065 "5: \n\t" |
/* IDCT variant for the path after local label 5.
   Loads src0 and src4 plus the following 8 bytes of each (8+src0, 8+src4);
   src1 and src5 are accepted but never referenced, so only the even-part
   sums A0..A3 are computed -- once for each of the two 8-byte groups.
   The two groups are shifted right by `shift`, packed together with packssdw
   and stored 8 bytes at a time with movq. Because there is no odd part, the
   same value is written to both rows of each mirrored pair:
   dst / 112+dst, 16+dst / 96+dst, 32+dst / 80+dst, 48+dst / 64+dst.
   `rounder` is pasted in front of ", %%mm4", ", %%mm0", ", %%mm1" and
   ", %%mm2". */
209 | 1066 #undef IDCT
1067 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1068 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1069 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1070 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1071 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1072 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1073 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1074 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1075 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1076 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1077 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1078 #rounder ", %%mm4 \n\t"\ | |
1079 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1080 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 1081 #rounder ", %%mm0 \n\t"\ |
1082 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1083 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1084 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1085 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1086 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1087 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ | |
1088 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1089 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1090 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1091 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1092 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1093 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1094 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1095 #rounder ", %%mm1 \n\t"\ | |
1096 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ | |
1097 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ | |
1098 #rounder ", %%mm2 \n\t"\ | |
1099 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ | |
1100 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ | |
1101 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ | |
1102 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1103 "psrad $" #shift ", %%mm4 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1104 "psrad $" #shift ", %%mm7 \n\t"\ |
209 | 1105 "psrad $" #shift ", %%mm3 \n\t"\ |
1106 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ | |
1107 "movq %%mm4, " #dst " \n\t"\ | |
1108 "psrad $" #shift ", %%mm0 \n\t"\ | |
1109 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ | |
1110 "movq %%mm0, 16+" #dst " \n\t"\ | |
1111 "movq %%mm0, 96+" #dst " \n\t"\ | |
1112 "movq %%mm4, 112+" #dst " \n\t"\ | |
1113 "psrad $" #shift ", %%mm5 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1114 "psrad $" #shift ", %%mm6 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1115 "psrad $" #shift ", %%mm2 \n\t"\ |
209 | 1116 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\ |
1117 "movq %%mm5, 32+" #dst " \n\t"\ | |
1118 "psrad $" #shift ", %%mm1 \n\t"\ | |
1119 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\ | |
1120 "movq %%mm6, 48+" #dst " \n\t"\ | |
1121 "movq %%mm6, 64+" #dst " \n\t"\ | |
1122 "movq %%mm5, 80+" #dst " \n\t" | |
1123 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1124 |
/* Second pass for the label-5 path. Only the invocations at source offsets
   0 and 16 are active; the ones at 8 and 24 are commented out, consistent
   with the macro in effect here also reading 8+src0 / 8+src4 and storing
   8 bytes per row. Shift is 20; afterwards jump to local label 9 (not in
   this excerpt).
   NOTE(review): rounder "/nop" presumably turns the rounder line into an
   assembler comment -- confirm for the GAS target. */
209 | 1125 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1126 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1127 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1128 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1129 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1130 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1131 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1132 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/* Local label 1: the target named by the "1f" argument of the Z_COND_IDCT
   call on the 96(%0) group in the first pass. */
1133 "#.balign 16 \n\t"\ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1134 "1: \n\t" |
/* IDCT variant for the path after local label 1.
   Loads src0, src4 and src1 only; src5 is accepted but never referenced, so
   B0..B3 are the single pmaddwd products of src1 (coefficients at 48, 64,
   80, 96 of %2).
   Each 32-bit result is shifted right by `shift`, packed to 16 bits with
   signed saturation (packssdw) and stored with movd at dst+0 ... dst+112 in
   steps of 16. `rounder` is pasted in front of ", %%mm4" and ", %%mm0". */
209 | 1135 #undef IDCT
1136 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1137 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1138 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1139 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
209 | 1140 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1141 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1142 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1143 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1144 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1145 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1146 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1147 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1148 #rounder ", %%mm4 \n\t"\ | |
1149 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1150 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1151 #rounder ", %%mm0 \n\t"\ | |
1152 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1153 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
209 | 1154 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1155 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1156 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1157 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1158 "movq 64(%2), %%mm1 \n\t"\ | |
1159 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1160 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1161 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
209 | 1162 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1163 "psrad $" #shift ", %%mm7 \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1164 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1165 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
1166 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1167 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1168 "psrad $" #shift ", %%mm0 \n\t"\ | |
1169 "psrad $" #shift ", %%mm3 \n\t"\ | |
1170 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1171 "movd %%mm7, " #dst " \n\t"\ | |
1172 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1173 "movd %%mm0, 16+" #dst " \n\t"\ | |
1174 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1175 "movd %%mm3, 96+" #dst " \n\t"\ | |
1176 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1177 "movd %%mm4, 112+" #dst " \n\t"\ | |
1178 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1179 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1180 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1181 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ | |
1182 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1183 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1184 "psrad $" #shift ", %%mm3 \n\t"\ | |
1185 "psrad $" #shift ", %%mm5 \n\t"\ | |
1186 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1187 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1188 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1189 "psrad $" #shift ", %%mm6 \n\t"\ |
209 | 1190 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1191 "movd %%mm3, 32+" #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1192 "psrad $" #shift ", %%mm4 \n\t"\ |
209 | 1193 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1194 "movd %%mm6, 48+" #dst " \n\t"\ | |
1195 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1196 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1197 "movd %%mm4, 64+" #dst " \n\t"\ | |
1198 "movd %%mm5, 80+" #dst " \n\t" | |
1199 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1200 |
209 | 1201 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1202 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1203 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1204 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1205 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1206 "jmp 9f \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1207 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1208 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1209 "#.balign 16 \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1210 "7: \n\t" |
209 | 1211 #undef IDCT |
1212 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1213 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1214 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1215 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1216 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1217 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1218 #rounder ", %%mm4 \n\t"\ | |
1219 #rounder ", %%mm0 \n\t"\ | |
1220 "psrad $" #shift ", %%mm4 \n\t"\ | |
1221 "psrad $" #shift ", %%mm0 \n\t"\ | |
1222 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1223 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1224 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1225 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1226 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1227 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1228 #rounder ", %%mm1 \n\t"\ | |
1229 #rounder ", %%mm2 \n\t"\ | |
1230 "psrad $" #shift ", %%mm1 \n\t"\ | |
1231 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ | |
1232 "movq %%mm4, " #dst " \n\t"\ | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1233 "psrad $" #shift ", %%mm2 \n\t"\ |
209 | 1234 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
1235 "movq %%mm0, 16+" #dst " \n\t"\ | |
1236 "movq %%mm0, 96+" #dst " \n\t"\ | |
1237 "movq %%mm4, 112+" #dst " \n\t"\ | |
1238 "movq %%mm0, 32+" #dst " \n\t"\ | |
1239 "movq %%mm4, 48+" #dst " \n\t"\ | |
1240 "movq %%mm4, 64+" #dst " \n\t"\ | |
1241 "movq %%mm0, 80+" #dst " \n\t" | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1242 |
209 | 1243 //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
1244 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1245 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1246 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1247 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1248 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1249 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1250 #endif |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1251 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1252 /* |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1253 Input |
209 | 1254 00 40 04 44 20 60 24 64 |
1255 10 30 14 34 50 70 54 74 | |
1256 01 41 03 43 21 61 23 63 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1257 11 31 13 33 51 71 53 73 |
209 | 1258 02 42 06 46 22 62 26 66 |
1259 12 32 16 36 52 72 56 76 | |
1260 05 45 07 47 25 65 27 67 | |
1261 15 35 17 37 55 75 57 77 | |
1262 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1263 Temp |
209 | 1264 00 04 10 14 20 24 30 34 |
1265 40 44 50 54 60 64 70 74 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1266 01 03 11 13 21 23 31 33 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1267 41 43 51 53 61 63 71 73 |
209 | 1268 02 06 12 16 22 26 32 36 |
1269 42 46 52 56 62 66 72 76 | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1270 05 07 15 17 25 27 35 37 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1271 45 47 55 57 65 67 75 77 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1272 */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1273 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1274 "9: \n\t" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1275 :: "r" (block), "r" (temp), "r" (coeffs) |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1276 : "%eax" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1277 ); |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1278 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1279 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
/*
 * simple_idct_mmx: non-static wrapper that hands block to idct() unchanged
 * and returns nothing.
 *
 * The inline-asm statement that ends just above lists block as its first
 * operand and stores its results through that operand, so the output is
 * written through the same pointer that is passed in here.
 *
 * NOTE(review): that asm presumably belongs to idct() -- confirm.
 * NOTE(review): presumably block holds one 8x8 set of 16-bit coefficients and
 * must be suitably aligned for movq accesses -- neither is checked here;
 * confirm against the callers.
 */
1280 void simple_idct_mmx(int16_t *block)
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1281 {
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1282 idct(block);
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1283 }