Mercurial > libavcodec.hg
annotate simple_idct.c @ 11034:fd5921186064 libavcodec
Make the fast loop filter path work with unavailable left MBs.
This prevents the issue with having to switch between slow and
fast code paths in each row.
0.5% faster loopfilter for cathedral
author | michael |
---|---|
date | Thu, 28 Jan 2010 02:15:25 +0000 |
parents | 266bf83f634d |
children | 7dd2a45249a9 |
rev | line source |
---|---|
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
1 /* |
429 | 2 * Simple IDCT |
3 * | |
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
5 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
429 | 9 * modify it under the terms of the GNU Lesser General Public |
10 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
429 | 12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
429 | 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
429 | 21 */ |
2967 | 22 |
1106 | 23 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8590
diff
changeset
|
24 * @file libavcodec/simple_idct.c |
1106 | 25 * simpleidct in C. |
26 */ | |
2967 | 27 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
28 /* |
429 | 29 based upon some outcommented c code from mpeg2dec (idct_mmx.c |
2967 | 30 written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) |
429 | 31 */ |
396
fce0a2520551
removed useless header includes - use av memory functions
glantau
parents:
352
diff
changeset
|
32 #include "avcodec.h" |
479 | 33 #include "dsputil.h" |
8497 | 34 #include "mathops.h" |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
35 #include "simple_idct.h" |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
36 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
37 #if 0 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
38 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
39 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
40 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
41 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
42 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
43 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
44 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
45 #define ROW_SHIFT 8 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
46 #define COL_SHIFT 17 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
47 #else |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
48 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
49 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
50 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
352
5a8eb5cf9f92
C4=16383 for the c version too and even for some outcommented code
michaelni
parents:
215
diff
changeset
|
51 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
52 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
53 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
54 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
55 #define ROW_SHIFT 11 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
56 #define COL_SHIFT 20 // 6 |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
57 #endif |
205 | 58 |
1008 | 59 static inline void idctRowCondDC (DCTELEM * row) |
205 | 60 { |
2979 | 61 int a0, a1, a2, a3, b0, b1, b2, b3; |
8590 | 62 #if HAVE_FAST_64BIT |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
63 uint64_t temp; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
64 #else |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
65 uint32_t temp; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
66 #endif |
205 | 67 |
8590 | 68 #if HAVE_FAST_64BIT |
9985 | 69 #if HAVE_BIGENDIAN |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
70 #define ROW0_MASK 0xffff000000000000LL |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
71 #else |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
72 #define ROW0_MASK 0xffffLL |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
73 #endif |
1008 | 74 if(sizeof(DCTELEM)==2){ |
2967 | 75 if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | |
1008 | 76 ((uint64_t *)row)[1]) == 0) { |
77 temp = (row[0] << 3) & 0xffff; | |
78 temp += temp << 16; | |
79 temp += temp << 32; | |
80 ((uint64_t *)row)[0] = temp; | |
81 ((uint64_t *)row)[1] = temp; | |
82 return; | |
2979 | 83 } |
1008 | 84 }else{ |
85 if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { | |
86 row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; | |
87 return; | |
88 } | |
89 } | |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
90 #else |
1008 | 91 if(sizeof(DCTELEM)==2){ |
92 if (!(((uint32_t*)row)[1] | | |
93 ((uint32_t*)row)[2] | | |
2967 | 94 ((uint32_t*)row)[3] | |
1008 | 95 row[1])) { |
96 temp = (row[0] << 3) & 0xffff; | |
97 temp += temp << 16; | |
98 ((uint32_t*)row)[0]=((uint32_t*)row)[1] = | |
99 ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp; | |
100 return; | |
101 } | |
102 }else{ | |
103 if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { | |
104 row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; | |
105 return; | |
106 } | |
107 } | |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
108 #endif |
205 | 109 |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
110 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); |
2979 | 111 a1 = a0; |
112 a2 = a0; | |
113 a3 = a0; | |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
114 |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
115 /* no need to optimize : gcc does it */ |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
116 a0 += W2 * row[2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
117 a1 += W6 * row[2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
118 a2 -= W6 * row[2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
119 a3 -= W2 * row[2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
120 |
8497 | 121 b0 = MUL16(W1, row[1]); |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
122 MAC16(b0, W3, row[3]); |
8497 | 123 b1 = MUL16(W3, row[1]); |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
124 MAC16(b1, -W7, row[3]); |
8497 | 125 b2 = MUL16(W5, row[1]); |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
126 MAC16(b2, -W1, row[3]); |
8497 | 127 b3 = MUL16(W7, row[1]); |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
128 MAC16(b3, -W5, row[3]); |
205 | 129 |
8590 | 130 #if HAVE_FAST_64BIT |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
131 temp = ((uint64_t*)row)[1]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
132 #else |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
133 temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
134 #endif |
2979 | 135 if (temp != 0) { |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
136 a0 += W4*row[4] + W6*row[6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
137 a1 += - W4*row[4] - W2*row[6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
138 a2 += - W4*row[4] + W2*row[6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
139 a3 += W4*row[4] - W6*row[6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
140 |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
141 MAC16(b0, W5, row[5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
142 MAC16(b0, W7, row[7]); |
2967 | 143 |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
144 MAC16(b1, -W1, row[5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
145 MAC16(b1, -W5, row[7]); |
2967 | 146 |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
147 MAC16(b2, W7, row[5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
148 MAC16(b2, W3, row[7]); |
2967 | 149 |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
150 MAC16(b3, W3, row[5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
151 MAC16(b3, -W1, row[7]); |
2979 | 152 } |
205 | 153 |
2979 | 154 row[0] = (a0 + b0) >> ROW_SHIFT; |
155 row[7] = (a0 - b0) >> ROW_SHIFT; | |
156 row[1] = (a1 + b1) >> ROW_SHIFT; | |
157 row[6] = (a1 - b1) >> ROW_SHIFT; | |
158 row[2] = (a2 + b2) >> ROW_SHIFT; | |
159 row[5] = (a2 - b2) >> ROW_SHIFT; | |
160 row[3] = (a3 + b3) >> ROW_SHIFT; | |
161 row[4] = (a3 - b3) >> ROW_SHIFT; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
162 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
163 |
2967 | 164 static inline void idctSparseColPut (uint8_t *dest, int line_size, |
1008 | 165 DCTELEM * col) |
205 | 166 { |
2979 | 167 int a0, a1, a2, a3, b0, b1, b2, b3; |
4176 | 168 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
169 |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
170 /* XXX: I did that only to give same values as previous code */ |
2979 | 171 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); |
172 a1 = a0; | |
173 a2 = a0; | |
174 a3 = a0; | |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
175 |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
176 a0 += + W2*col[8*2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
177 a1 += + W6*col[8*2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
178 a2 += - W6*col[8*2]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
179 a3 += - W2*col[8*2]; |
205 | 180 |
8497 | 181 b0 = MUL16(W1, col[8*1]); |
182 b1 = MUL16(W3, col[8*1]); | |
183 b2 = MUL16(W5, col[8*1]); | |
184 b3 = MUL16(W7, col[8*1]); | |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
185 |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
186 MAC16(b0, + W3, col[8*3]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
187 MAC16(b1, - W7, col[8*3]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
188 MAC16(b2, - W1, col[8*3]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
189 MAC16(b3, - W5, col[8*3]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
190 |
2979 | 191 if(col[8*4]){ |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
192 a0 += + W4*col[8*4]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
193 a1 += - W4*col[8*4]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
194 a2 += - W4*col[8*4]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
195 a3 += + W4*col[8*4]; |
2979 | 196 } |
205 | 197 |
2979 | 198 if (col[8*5]) { |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
199 MAC16(b0, + W5, col[8*5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
200 MAC16(b1, - W1, col[8*5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
201 MAC16(b2, + W7, col[8*5]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
202 MAC16(b3, + W3, col[8*5]); |
2979 | 203 } |
205 | 204 |
2979 | 205 if(col[8*6]){ |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
206 a0 += + W6*col[8*6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
207 a1 += - W2*col[8*6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
208 a2 += + W2*col[8*6]; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
209 a3 += - W6*col[8*6]; |
2979 | 210 } |
205 | 211 |
2979 | 212 if (col[8*7]) { |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
213 MAC16(b0, + W7, col[8*7]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
214 MAC16(b1, - W5, col[8*7]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
215 MAC16(b2, + W3, col[8*7]); |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
216 MAC16(b3, - W1, col[8*7]); |
2979 | 217 } |
205 | 218 |
479 | 219 dest[0] = cm[(a0 + b0) >> COL_SHIFT]; |
220 dest += line_size; | |
221 dest[0] = cm[(a1 + b1) >> COL_SHIFT]; | |
222 dest += line_size; | |
223 dest[0] = cm[(a2 + b2) >> COL_SHIFT]; | |
224 dest += line_size; | |
225 dest[0] = cm[(a3 + b3) >> COL_SHIFT]; | |
226 dest += line_size; | |
227 dest[0] = cm[(a3 - b3) >> COL_SHIFT]; | |
228 dest += line_size; | |
229 dest[0] = cm[(a2 - b2) >> COL_SHIFT]; | |
230 dest += line_size; | |
231 dest[0] = cm[(a1 - b1) >> COL_SHIFT]; | |
232 dest += line_size; | |
233 dest[0] = cm[(a0 - b0) >> COL_SHIFT]; | |
234 } | |
235 | |
2967 | 236 static inline void idctSparseColAdd (uint8_t *dest, int line_size, |
1008 | 237 DCTELEM * col) |
479 | 238 { |
2979 | 239 int a0, a1, a2, a3, b0, b1, b2, b3; |
4176 | 240 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
479 | 241 |
242 /* XXX: I did that only to give same values as previous code */ | |
2979 | 243 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); |
244 a1 = a0; | |
245 a2 = a0; | |
246 a3 = a0; | |
479 | 247 |
248 a0 += + W2*col[8*2]; | |
249 a1 += + W6*col[8*2]; | |
250 a2 += - W6*col[8*2]; | |
251 a3 += - W2*col[8*2]; | |
252 | |
8497 | 253 b0 = MUL16(W1, col[8*1]); |
254 b1 = MUL16(W3, col[8*1]); | |
255 b2 = MUL16(W5, col[8*1]); | |
256 b3 = MUL16(W7, col[8*1]); | |
479 | 257 |
258 MAC16(b0, + W3, col[8*3]); | |
259 MAC16(b1, - W7, col[8*3]); | |
260 MAC16(b2, - W1, col[8*3]); | |
261 MAC16(b3, - W5, col[8*3]); | |
262 | |
2979 | 263 if(col[8*4]){ |
479 | 264 a0 += + W4*col[8*4]; |
265 a1 += - W4*col[8*4]; | |
266 a2 += - W4*col[8*4]; | |
267 a3 += + W4*col[8*4]; | |
2979 | 268 } |
479 | 269 |
2979 | 270 if (col[8*5]) { |
479 | 271 MAC16(b0, + W5, col[8*5]); |
272 MAC16(b1, - W1, col[8*5]); | |
273 MAC16(b2, + W7, col[8*5]); | |
274 MAC16(b3, + W3, col[8*5]); | |
2979 | 275 } |
479 | 276 |
2979 | 277 if(col[8*6]){ |
479 | 278 a0 += + W6*col[8*6]; |
279 a1 += - W2*col[8*6]; | |
280 a2 += + W2*col[8*6]; | |
281 a3 += - W6*col[8*6]; | |
2979 | 282 } |
479 | 283 |
2979 | 284 if (col[8*7]) { |
479 | 285 MAC16(b0, + W7, col[8*7]); |
286 MAC16(b1, - W5, col[8*7]); | |
287 MAC16(b2, + W3, col[8*7]); | |
288 MAC16(b3, - W1, col[8*7]); | |
2979 | 289 } |
479 | 290 |
291 dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)]; | |
292 dest += line_size; | |
293 dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)]; | |
294 dest += line_size; | |
295 dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)]; | |
296 dest += line_size; | |
297 dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)]; | |
298 dest += line_size; | |
299 dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)]; | |
300 dest += line_size; | |
301 dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)]; | |
302 dest += line_size; | |
303 dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)]; | |
304 dest += line_size; | |
305 dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)]; | |
175
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
306 } |
bd77d3cbb233
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
diff
changeset
|
307 |
1008 | 308 static inline void idctSparseCol (DCTELEM * col) |
633 | 309 { |
2979 | 310 int a0, a1, a2, a3, b0, b1, b2, b3; |
633 | 311 |
312 /* XXX: I did that only to give same values as previous code */ | |
2979 | 313 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); |
314 a1 = a0; | |
315 a2 = a0; | |
316 a3 = a0; | |
633 | 317 |
318 a0 += + W2*col[8*2]; | |
319 a1 += + W6*col[8*2]; | |
320 a2 += - W6*col[8*2]; | |
321 a3 += - W2*col[8*2]; | |
322 | |
8497 | 323 b0 = MUL16(W1, col[8*1]); |
324 b1 = MUL16(W3, col[8*1]); | |
325 b2 = MUL16(W5, col[8*1]); | |
326 b3 = MUL16(W7, col[8*1]); | |
633 | 327 |
328 MAC16(b0, + W3, col[8*3]); | |
329 MAC16(b1, - W7, col[8*3]); | |
330 MAC16(b2, - W1, col[8*3]); | |
331 MAC16(b3, - W5, col[8*3]); | |
332 | |
2979 | 333 if(col[8*4]){ |
633 | 334 a0 += + W4*col[8*4]; |
335 a1 += - W4*col[8*4]; | |
336 a2 += - W4*col[8*4]; | |
337 a3 += + W4*col[8*4]; | |
2979 | 338 } |
633 | 339 |
2979 | 340 if (col[8*5]) { |
633 | 341 MAC16(b0, + W5, col[8*5]); |
342 MAC16(b1, - W1, col[8*5]); | |
343 MAC16(b2, + W7, col[8*5]); | |
344 MAC16(b3, + W3, col[8*5]); | |
2979 | 345 } |
633 | 346 |
2979 | 347 if(col[8*6]){ |
633 | 348 a0 += + W6*col[8*6]; |
349 a1 += - W2*col[8*6]; | |
350 a2 += + W2*col[8*6]; | |
351 a3 += - W6*col[8*6]; | |
2979 | 352 } |
633 | 353 |
2979 | 354 if (col[8*7]) { |
633 | 355 MAC16(b0, + W7, col[8*7]); |
356 MAC16(b1, - W5, col[8*7]); | |
357 MAC16(b2, + W3, col[8*7]); | |
358 MAC16(b3, - W1, col[8*7]); | |
2979 | 359 } |
633 | 360 |
361 col[0 ] = ((a0 + b0) >> COL_SHIFT); | |
362 col[8 ] = ((a1 + b1) >> COL_SHIFT); | |
363 col[16] = ((a2 + b2) >> COL_SHIFT); | |
364 col[24] = ((a3 + b3) >> COL_SHIFT); | |
365 col[32] = ((a3 - b3) >> COL_SHIFT); | |
366 col[40] = ((a2 - b2) >> COL_SHIFT); | |
367 col[48] = ((a1 - b1) >> COL_SHIFT); | |
368 col[56] = ((a0 - b0) >> COL_SHIFT); | |
369 } | |
370 | |
6001 | 371 void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
372 { |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
373 int i; |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
374 for(i=0; i<8; i++) |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
375 idctRowCondDC(block + i*8); |
2967 | 376 |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
377 for(i=0; i<8; i++) |
479 | 378 idctSparseColPut(dest + i, line_size, block + i); |
379 } | |
380 | |
6001 | 381 void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block) |
479 | 382 { |
383 int i; | |
384 for(i=0; i<8; i++) | |
385 idctRowCondDC(block + i*8); | |
2967 | 386 |
479 | 387 for(i=0; i<8; i++) |
388 idctSparseColAdd(dest + i, line_size, block + i); | |
476
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
389 } |
ec13b0a726c3
removed unused code - began to merge alpha specific stuff - added mac macros for suitable CPUs
bellard
parents:
464
diff
changeset
|
390 |
6001 | 391 void ff_simple_idct(DCTELEM *block) |
633 | 392 { |
393 int i; | |
394 for(i=0; i<8; i++) | |
395 idctRowCondDC(block + i*8); | |
2967 | 396 |
633 | 397 for(i=0; i<8; i++) |
398 idctSparseCol(block + i); | |
399 } | |
400 | |
719 | 401 /* 2x4x8 idct */ |
402 | |
403 #define CN_SHIFT 12 | |
404 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5)) | |
722
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
405 #define C1 C_FIX(0.6532814824) |
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
406 #define C2 C_FIX(0.2705980501) |
719 | 407 |
722
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
408 /* row idct is multiple by 16 * sqrt(2.0), col idct4 is normalized, |
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
409 and the butterfly must be multiplied by 0.5 * sqrt(2.0) */ |
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
410 #define C_SHIFT (4+1+12) |
719 | 411 |
5978 | 412 static inline void idct4col_put(uint8_t *dest, int line_size, const DCTELEM *col) |
719 | 413 { |
414 int c0, c1, c2, c3, a0, a1, a2, a3; | |
4176 | 415 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
719 | 416 |
417 a0 = col[8*0]; | |
418 a1 = col[8*2]; | |
419 a2 = col[8*4]; | |
420 a3 = col[8*6]; | |
722
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
421 c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1)); |
ff90043f4a2d
in fact IDCT248 needs to be normalized as I suspected
bellard
parents:
719
diff
changeset
|
422 c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1)); |
719 | 423 c1 = a1 * C1 + a3 * C2; |
424 c3 = a1 * C2 - a3 * C1; | |
425 dest[0] = cm[(c0 + c1) >> C_SHIFT]; | |
426 dest += line_size; | |
427 dest[0] = cm[(c2 + c3) >> C_SHIFT]; | |
428 dest += line_size; | |
429 dest[0] = cm[(c2 - c3) >> C_SHIFT]; | |
430 dest += line_size; | |
431 dest[0] = cm[(c0 - c1) >> C_SHIFT]; | |
432 } | |
433 | |
434 #define BF(k) \ | |
435 {\ | |
436 int a0, a1;\ | |
437 a0 = ptr[k];\ | |
438 a1 = ptr[8 + k];\ | |
439 ptr[k] = a0 + a1;\ | |
440 ptr[8 + k] = a0 - a1;\ | |
441 } | |
442 | |
443 /* only used by DV codec. The input must be interlaced. 128 is added | |
444 to the pixels before clamping to avoid systematic error | |
445 (1024*sqrt(2)) offset would be needed otherwise. */ | |
446 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to | |
447 compensate the extra butterfly stage - I don't have the full DV | |
448 specification */ | |
6001 | 449 void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block) |
719 | 450 { |
451 int i; | |
1008 | 452 DCTELEM *ptr; |
2967 | 453 |
719 | 454 /* butterfly */ |
455 ptr = block; | |
456 for(i=0;i<4;i++) { | |
457 BF(0); | |
458 BF(1); | |
459 BF(2); | |
460 BF(3); | |
461 BF(4); | |
462 BF(5); | |
463 BF(6); | |
464 BF(7); | |
465 ptr += 2 * 8; | |
466 } | |
467 | |
468 /* IDCT8 on each line */ | |
469 for(i=0; i<8; i++) { | |
470 idctRowCondDC(block + i*8); | |
471 } | |
472 | |
473 /* IDCT4 and store */ | |
474 for(i=0;i<8;i++) { | |
5978 | 475 idct4col_put(dest + i, 2 * line_size, block + i); |
476 idct4col_put(dest + line_size + i, 2 * line_size, block + 8 + i); | |
719 | 477 } |
478 } | |
936 | 479 |
480 /* 8x4 & 4x8 WMV2 IDCT */ | |
481 #undef CN_SHIFT | |
482 #undef C_SHIFT | |
483 #undef C_FIX | |
484 #undef C1 | |
485 #undef C2 | |
486 #define CN_SHIFT 12 | |
487 #define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5)) | |
488 #define C1 C_FIX(0.6532814824) | |
489 #define C2 C_FIX(0.2705980501) | |
490 #define C3 C_FIX(0.5) | |
491 #define C_SHIFT (4+1+12) | |
1064 | 492 static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col) |
936 | 493 { |
494 int c0, c1, c2, c3, a0, a1, a2, a3; | |
4176 | 495 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 496 |
497 a0 = col[8*0]; | |
498 a1 = col[8*1]; | |
499 a2 = col[8*2]; | |
500 a3 = col[8*3]; | |
501 c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1)); | |
502 c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1)); | |
503 c1 = a1 * C1 + a3 * C2; | |
504 c3 = a1 * C2 - a3 * C1; | |
505 dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)]; | |
506 dest += line_size; | |
507 dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)]; | |
508 dest += line_size; | |
509 dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)]; | |
510 dest += line_size; | |
511 dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)]; | |
512 } | |
513 | |
514 #define RN_SHIFT 15 | |
515 #define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5)) | |
516 #define R1 R_FIX(0.6532814824) | |
517 #define R2 R_FIX(0.2705980501) | |
518 #define R3 R_FIX(0.5) | |
519 #define R_SHIFT 11 | |
1008 | 520 static inline void idct4row(DCTELEM *row) |
936 | 521 { |
522 int c0, c1, c2, c3, a0, a1, a2, a3; | |
4176 | 523 //const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 524 |
525 a0 = row[0]; | |
526 a1 = row[1]; | |
527 a2 = row[2]; | |
528 a3 = row[3]; | |
529 c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1)); | |
530 c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1)); | |
531 c1 = a1 * R1 + a3 * R2; | |
532 c3 = a1 * R2 - a3 * R1; | |
533 row[0]= (c0 + c1) >> R_SHIFT; | |
534 row[1]= (c2 + c3) >> R_SHIFT; | |
535 row[2]= (c2 - c3) >> R_SHIFT; | |
536 row[3]= (c0 - c1) >> R_SHIFT; | |
537 } | |
538 | |
6001 | 539 void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block) |
936 | 540 { |
541 int i; | |
542 | |
543 /* IDCT8 on each line */ | |
544 for(i=0; i<4; i++) { | |
545 idctRowCondDC(block + i*8); | |
546 } | |
547 | |
548 /* IDCT4 and store */ | |
549 for(i=0;i<8;i++) { | |
550 idct4col_add(dest + i, line_size, block + i); | |
551 } | |
552 } | |
553 | |
6001 | 554 void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block) |
936 | 555 { |
556 int i; | |
557 | |
558 /* IDCT4 on each line */ | |
559 for(i=0; i<8; i++) { | |
560 idct4row(block + i*8); | |
561 } | |
562 | |
563 /* IDCT8 and store */ | |
564 for(i=0; i<4; i++){ | |
565 idctSparseColAdd(dest + i, line_size, block + i); | |
566 } | |
567 } | |
568 | |
6007 | 569 void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block) |
570 { | |
571 int i; | |
572 | |
573 /* IDCT4 on each line */ | |
574 for(i=0; i<4; i++) { | |
575 idct4row(block + i*8); | |
576 } | |
577 | |
578 /* IDCT4 and store */ | |
579 for(i=0; i<4; i++){ | |
580 idct4col_add(dest + i, line_size, block + i); | |
581 } | |
582 } |