Mercurial > libavcodec.hg
annotate dsputil.c @ 4166:eced83504436 libavcodec
mp3 header (de)compression bitstream filter
this will make mp3 frames 4 bytes smaller, it will not give you binary identical mp3 files, but it will give you mp3 files which decode to binary identical output
this will only work in containers providing at least packet size, sample_rate and number of channels
bugreports about mp3 files for which this fails are welcome
and this is experimental (don't expect compatibility, and don't even expect to be able to decompress what you compressed; in fact, don't even expect this to work without editing the source a little)
author | michael |
---|---|
date | Fri, 10 Nov 2006 01:41:53 +0000 |
parents | 34fdffe98bd0 |
children | 8535fcac43c1 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils | |
429 | 3 * Copyright (c) 2000, 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1729
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
0 | 5 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
429 | 9 * modify it under the terms of the GNU Lesser General Public |
10 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
0 | 12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
0 | 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 * Lesser General Public License for more details. | |
0 | 17 * |
429 | 18 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
3029
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
256 | 21 * |
385 | 22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> |
0 | 23 */ |
2967 | 24 |
1106 | 25 /** |
26 * @file dsputil.c | |
27 * DSP utils | |
28 */ | |
2967 | 29 |
0 | 30 #include "avcodec.h" |
31 #include "dsputil.h" | |
936 | 32 #include "mpegvideo.h" |
1092 | 33 #include "simple_idct.h" |
1557 | 34 #include "faandct.h" |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
35 #include "snow.h" |
676 | 36 |
2522
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
37 /* snow.c */ |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
38 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count); |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
39 |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
40 /* vorbis.c */ |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
41 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize); |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
42 |
2169
db8baace74d8
Minor Patch for shared libs on Mac OSX by (Bill May <wmay at cisco dot com>)
michael
parents:
2066
diff
changeset
|
/* cropTbl[x + MAX_NEG_CROP] clamps x to the 0..255 range and
   squareTbl[x + 256] == x*x for -256 <= x < 256.
   Both are zero here — presumably filled in at runtime by the init code;
   the explicit {0, } initializers were added for shared-library builds on
   Mac OS X (per the changeset log) — TODO confirm against dsputil_init(). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t squareTbl[512] = {0, };
0 | 45 |
/* Standard (not permutated) zigzag scan order for an 8x8 coefficient block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
56 | |
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
69 | |
/* not permutated inverse zigzag_direct + 1 for MMX quantizer;
   zeroed here, presumably filled in by the init code — TODO confirm */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
220 | 72 |
/* Alternate horizontal scan order for an 8x8 coefficient block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83 | |
/* Alternate vertical scan order for an 8x8 coefficient block. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94 | |
220 | 95 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */ |
1064 | 96 const uint32_t inverse[256]={ |
2967 | 97 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757, |
98 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154, | |
99 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709, | |
100 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333, | |
101 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367, | |
102 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283, | |
103 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315, | |
104 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085, | |
105 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498, | |
106 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675, | |
107 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441, | |
108 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183, | |
109 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712, | |
110 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400, | |
111 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163, | |
112 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641, | |
113 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573, | |
114 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737, | |
115 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493, | |
116 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373, | |
117 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368, | |
118 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671, | |
119 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767, | |
120 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740, | |
121 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751, | |
122 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635, | |
123 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593, | |
124 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944, | |
125 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933, | |
126 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575, | |
127 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532, | |
220 | 128 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010, |
129 }; | |
130 | |
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
142 | |
/**
 * Sum all 256 pixels of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size stride in bytes between successive rows
 * @return the sum of all 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
164 | |
1064 | 165 static int pix_norm1_c(uint8_t * pix, int line_size) |
612 | 166 { |
167 int s, i, j; | |
1064 | 168 uint32_t *sq = squareTbl + 256; |
612 | 169 |
170 s = 0; | |
171 for (i = 0; i < 16; i++) { | |
2979 | 172 for (j = 0; j < 16; j += 8) { |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
173 #if 0 |
2979 | 174 s += sq[pix[0]]; |
175 s += sq[pix[1]]; | |
176 s += sq[pix[2]]; | |
177 s += sq[pix[3]]; | |
178 s += sq[pix[4]]; | |
179 s += sq[pix[5]]; | |
180 s += sq[pix[6]]; | |
181 s += sq[pix[7]]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
182 #else |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
183 #if LONG_MAX > 2147483647 |
2979 | 184 register uint64_t x=*(uint64_t*)pix; |
185 s += sq[x&0xff]; | |
186 s += sq[(x>>8)&0xff]; | |
187 s += sq[(x>>16)&0xff]; | |
188 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
189 s += sq[(x>>32)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
190 s += sq[(x>>40)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
191 s += sq[(x>>48)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
192 s += sq[(x>>56)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
193 #else |
2979 | 194 register uint32_t x=*(uint32_t*)pix; |
195 s += sq[x&0xff]; | |
196 s += sq[(x>>8)&0xff]; | |
197 s += sq[(x>>16)&0xff]; | |
198 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
199 x=*(uint32_t*)(pix+4); |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
200 s += sq[x&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
201 s += sq[(x>>8)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
202 s += sq[(x>>16)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
203 s += sq[(x>>24)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
204 #endif |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
205 #endif |
2979 | 206 pix += 8; |
207 } | |
208 pix += line_size - 16; | |
612 | 209 } |
210 return s; | |
211 } | |
212 | |
/**
 * Byte-swap a buffer of w 32-bit words from src into dst.
 * Each element is handled independently, so dst may alias src.
 * NOTE(review): bswap_32() comes from the project's byte-swap header.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i, k;

    /* bulk: eight words per iteration, then a scalar tail */
    for (i = 0; i + 8 <= w; i += 8)
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
612 | 230 |
2184 | 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
232 { | |
233 int s, i; | |
234 uint32_t *sq = squareTbl + 256; | |
235 | |
236 s = 0; | |
237 for (i = 0; i < h; i++) { | |
238 s += sq[pix1[0] - pix2[0]]; | |
239 s += sq[pix1[1] - pix2[1]]; | |
240 s += sq[pix1[2] - pix2[2]]; | |
241 s += sq[pix1[3] - pix2[3]]; | |
242 pix1 += line_size; | |
243 pix2 += line_size; | |
244 } | |
245 return s; | |
246 } | |
247 | |
1708 | 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
936 | 249 { |
250 int s, i; | |
1064 | 251 uint32_t *sq = squareTbl + 256; |
936 | 252 |
253 s = 0; | |
1708 | 254 for (i = 0; i < h; i++) { |
936 | 255 s += sq[pix1[0] - pix2[0]]; |
256 s += sq[pix1[1] - pix2[1]]; | |
257 s += sq[pix1[2] - pix2[2]]; | |
258 s += sq[pix1[3] - pix2[3]]; | |
259 s += sq[pix1[4] - pix2[4]]; | |
260 s += sq[pix1[5] - pix2[5]]; | |
261 s += sq[pix1[6] - pix2[6]]; | |
262 s += sq[pix1[7] - pix2[7]]; | |
263 pix1 += line_size; | |
264 pix2 += line_size; | |
265 } | |
266 return s; | |
267 } | |
268 | |
1708 | 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
884 | 270 { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
271 int s, i; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
272 uint32_t *sq = squareTbl + 256; |
884 | 273 |
274 s = 0; | |
1708 | 275 for (i = 0; i < h; i++) { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
276 s += sq[pix1[ 0] - pix2[ 0]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
277 s += sq[pix1[ 1] - pix2[ 1]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
278 s += sq[pix1[ 2] - pix2[ 2]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
279 s += sq[pix1[ 3] - pix2[ 3]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
280 s += sq[pix1[ 4] - pix2[ 4]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
281 s += sq[pix1[ 5] - pix2[ 5]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
282 s += sq[pix1[ 6] - pix2[ 6]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
283 s += sq[pix1[ 7] - pix2[ 7]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
284 s += sq[pix1[ 8] - pix2[ 8]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
285 s += sq[pix1[ 9] - pix2[ 9]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
286 s += sq[pix1[10] - pix2[10]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
287 s += sq[pix1[11] - pix2[11]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
288 s += sq[pix1[12] - pix2[12]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
289 s += sq[pix1[13] - pix2[13]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
290 s += sq[pix1[14] - pix2[14]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
291 s += sq[pix1[15] - pix2[15]]; |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
292 |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
293 pix1 += line_size; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
294 pix2 += line_size; |
884 | 295 } |
296 return s; | |
297 } | |
298 | |
2184 | 299 |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
300 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c |
/**
 * Wavelet-domain distortion between two blocks (snow encoder cmp function).
 *
 * The pixel differences (scaled up by 16 for integer-DWT precision) are
 * transformed with ff_spatial_dwt(), then each subband's absolute
 * coefficients are accumulated with a per-subband weight from the scale
 * table below.
 *
 * @param type 1 selects the 5/3 wavelet, 0 the 9/7 (per the table comments)
 * @param w    block width: 8 uses a 3-level decomposition, otherwise 4 levels
 *             (w and h must be equal — see the assert below)
 * @return weighted sum of absolute subband coefficients, >> 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    /* subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
          // 9/7 8x8 dec=3
          {268, 239, 239, 213},
          {  0, 224, 224, 152},
          {  0, 135, 135, 110},
        },{
          // 9/7 16x16 or 32x32 dec=4
          {344, 310, 310, 280},
          {  0, 320, 320, 228},
          {  0, 175, 175, 136},
          {  0, 129, 129, 102},
        }
      },{
        {
          // 5/3 8x8 dec=3
          {275, 245, 245, 218},
          {  0, 230, 230, 156},
          {  0, 138, 138, 113},
        },{
          // 5/3 16x16 or 32x32 dec=4
          {352, 317, 317, 286},
          {  0, 328, 328, 233},
          {  0, 180, 180, 140},
          {  0, 132, 132, 105},
        }
      }
    };

    /* pixel differences << 4, laid out on a fixed 32-wide stride */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* accumulate |coefficient| * weight per subband; level 0 includes the
       LL band (ori 0), higher levels only LH/HL/HH */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
369 | |
/* cmp-function wrappers around w_c(): 5/3 (w53) and 9/7 (w97) wavelet
   distortion at widths 8 and 16 */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
385 | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
/* 32x32 variants; non-static — per the changeset log they are also used by
   snow's iterative motion estimation */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
393 #endif |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
394 |
1064 | 395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
0 | 396 { |
397 int i; | |
398 | |
399 /* read the pixels */ | |
400 for(i=0;i<8;i++) { | |
516 | 401 block[0] = pixels[0]; |
402 block[1] = pixels[1]; | |
403 block[2] = pixels[2]; | |
404 block[3] = pixels[3]; | |
405 block[4] = pixels[4]; | |
406 block[5] = pixels[5]; | |
407 block[6] = pixels[6]; | |
408 block[7] = pixels[7]; | |
409 pixels += line_size; | |
410 block += 8; | |
0 | 411 } |
412 } | |
413 | |
1064 | 414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1, |
2979 | 415 const uint8_t *s2, int stride){ |
324 | 416 int i; |
417 | |
418 /* read the pixels */ | |
419 for(i=0;i<8;i++) { | |
516 | 420 block[0] = s1[0] - s2[0]; |
421 block[1] = s1[1] - s2[1]; | |
422 block[2] = s1[2] - s2[2]; | |
423 block[3] = s1[3] - s2[3]; | |
424 block[4] = s1[4] - s2[4]; | |
425 block[5] = s1[5] - s2[5]; | |
426 block[6] = s1[6] - s2[6]; | |
427 block[7] = s1[7] - s2[7]; | |
324 | 428 s1 += stride; |
429 s2 += stride; | |
516 | 430 block += 8; |
324 | 431 } |
432 } | |
433 | |
434 | |
1064 | 435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 436 int line_size) |
0 | 437 { |
438 int i; | |
1064 | 439 uint8_t *cm = cropTbl + MAX_NEG_CROP; |
2967 | 440 |
0 | 441 /* read the pixels */ |
442 for(i=0;i<8;i++) { | |
516 | 443 pixels[0] = cm[block[0]]; |
444 pixels[1] = cm[block[1]]; | |
445 pixels[2] = cm[block[2]]; | |
446 pixels[3] = cm[block[3]]; | |
447 pixels[4] = cm[block[4]]; | |
448 pixels[5] = cm[block[5]]; | |
449 pixels[6] = cm[block[6]]; | |
450 pixels[7] = cm[block[7]]; | |
451 | |
452 pixels += line_size; | |
453 block += 8; | |
0 | 454 } |
455 } | |
456 | |
2256 | 457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 458 int line_size) |
2256 | 459 { |
460 int i; | |
461 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
2967 | 462 |
2256 | 463 /* read the pixels */ |
464 for(i=0;i<4;i++) { | |
465 pixels[0] = cm[block[0]]; | |
466 pixels[1] = cm[block[1]]; | |
467 pixels[2] = cm[block[2]]; | |
468 pixels[3] = cm[block[3]]; | |
469 | |
470 pixels += line_size; | |
471 block += 8; | |
472 } | |
473 } | |
474 | |
2257 | 475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 476 int line_size) |
2257 | 477 { |
478 int i; | |
479 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
2967 | 480 |
2257 | 481 /* read the pixels */ |
482 for(i=0;i<2;i++) { | |
483 pixels[0] = cm[block[0]]; | |
484 pixels[1] = cm[block[1]]; | |
485 | |
486 pixels += line_size; | |
487 block += 8; | |
488 } | |
489 } | |
490 | |
2967 | 491 static void put_signed_pixels_clamped_c(const DCTELEM *block, |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
492 uint8_t *restrict pixels, |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
493 int line_size) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
494 { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
495 int i, j; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
496 |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
497 for (i = 0; i < 8; i++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
498 for (j = 0; j < 8; j++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
499 if (*block < -128) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
500 *pixels = 0; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
501 else if (*block > 127) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
502 *pixels = 255; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
503 else |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
504 *pixels = (uint8_t)(*block + 128); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
505 block++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
506 pixels++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
507 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
508 pixels += (line_size - 8); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
509 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
510 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
511 |
1064 | 512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
516 | 513 int line_size) |
0 | 514 { |
515 int i; | |
1064 | 516 uint8_t *cm = cropTbl + MAX_NEG_CROP; |
2967 | 517 |
0 | 518 /* read the pixels */ |
519 for(i=0;i<8;i++) { | |
516 | 520 pixels[0] = cm[pixels[0] + block[0]]; |
521 pixels[1] = cm[pixels[1] + block[1]]; | |
522 pixels[2] = cm[pixels[2] + block[2]]; | |
523 pixels[3] = cm[pixels[3] + block[3]]; | |
524 pixels[4] = cm[pixels[4] + block[4]]; | |
525 pixels[5] = cm[pixels[5] + block[5]]; | |
526 pixels[6] = cm[pixels[6] + block[6]]; | |
527 pixels[7] = cm[pixels[7] + block[7]]; | |
528 pixels += line_size; | |
529 block += 8; | |
0 | 530 } |
531 } | |
2256 | 532 |
533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, | |
534 int line_size) | |
535 { | |
536 int i; | |
537 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
2967 | 538 |
2256 | 539 /* read the pixels */ |
540 for(i=0;i<4;i++) { | |
541 pixels[0] = cm[pixels[0] + block[0]]; | |
542 pixels[1] = cm[pixels[1] + block[1]]; | |
543 pixels[2] = cm[pixels[2] + block[2]]; | |
544 pixels[3] = cm[pixels[3] + block[3]]; | |
545 pixels += line_size; | |
546 block += 8; | |
547 } | |
548 } | |
2257 | 549 |
550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, | |
551 int line_size) | |
552 { | |
553 int i; | |
554 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
2967 | 555 |
2257 | 556 /* read the pixels */ |
557 for(i=0;i<2;i++) { | |
558 pixels[0] = cm[pixels[0] + block[0]]; | |
559 pixels[1] = cm[pixels[1] + block[1]]; | |
560 pixels += line_size; | |
561 block += 8; | |
562 } | |
563 } | |
2763 | 564 |
565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
566 { | |
567 int i; | |
568 for(i=0;i<8;i++) { | |
569 pixels[0] += block[0]; | |
570 pixels[1] += block[1]; | |
571 pixels[2] += block[2]; | |
572 pixels[3] += block[3]; | |
573 pixels[4] += block[4]; | |
574 pixels[5] += block[5]; | |
575 pixels[6] += block[6]; | |
576 pixels[7] += block[7]; | |
577 pixels += line_size; | |
578 block += 8; | |
579 } | |
580 } | |
581 | |
582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
583 { | |
584 int i; | |
585 for(i=0;i<4;i++) { | |
586 pixels[0] += block[0]; | |
587 pixels[1] += block[1]; | |
588 pixels[2] += block[2]; | |
589 pixels[3] += block[3]; | |
590 pixels += line_size; | |
591 block += 4; | |
592 } | |
593 } | |
594 | |
385 | 595 #if 0 |
596 | |
597 #define PIXOP2(OPNAME, OP) \ | |
651 | 598 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 599 {\ |
600 int i;\ | |
601 for(i=0; i<h; i++){\ | |
602 OP(*((uint64_t*)block), LD64(pixels));\ | |
603 pixels+=line_size;\ | |
604 block +=line_size;\ | |
605 }\ | |
606 }\ | |
607 \ | |
859 | 608 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 609 {\ |
610 int i;\ | |
611 for(i=0; i<h; i++){\ | |
612 const uint64_t a= LD64(pixels );\ | |
613 const uint64_t b= LD64(pixels+1);\ | |
614 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
615 pixels+=line_size;\ | |
616 block +=line_size;\ | |
617 }\ | |
618 }\ | |
619 \ | |
859 | 620 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 621 {\ |
622 int i;\ | |
623 for(i=0; i<h; i++){\ | |
624 const uint64_t a= LD64(pixels );\ | |
625 const uint64_t b= LD64(pixels+1);\ | |
626 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
627 pixels+=line_size;\ | |
628 block +=line_size;\ | |
629 }\ | |
630 }\ | |
631 \ | |
859 | 632 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 633 {\ |
634 int i;\ | |
635 for(i=0; i<h; i++){\ | |
636 const uint64_t a= LD64(pixels );\ | |
637 const uint64_t b= LD64(pixels+line_size);\ | |
638 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
639 pixels+=line_size;\ | |
640 block +=line_size;\ | |
641 }\ | |
642 }\ | |
643 \ | |
859 | 644 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 645 {\ |
646 int i;\ | |
647 for(i=0; i<h; i++){\ | |
648 const uint64_t a= LD64(pixels );\ | |
649 const uint64_t b= LD64(pixels+line_size);\ | |
650 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
651 pixels+=line_size;\ | |
652 block +=line_size;\ | |
653 }\ | |
654 }\ | |
655 \ | |
859 | 656 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 657 {\ |
658 int i;\ | |
659 const uint64_t a= LD64(pixels );\ | |
660 const uint64_t b= LD64(pixels+1);\ | |
661 uint64_t l0= (a&0x0303030303030303ULL)\ | |
662 + (b&0x0303030303030303ULL)\ | |
663 + 0x0202020202020202ULL;\ | |
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
666 uint64_t l1,h1;\ | |
667 \ | |
668 pixels+=line_size;\ | |
669 for(i=0; i<h; i+=2){\ | |
670 uint64_t a= LD64(pixels );\ | |
671 uint64_t b= LD64(pixels+1);\ | |
672 l1= (a&0x0303030303030303ULL)\ | |
673 + (b&0x0303030303030303ULL);\ | |
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
677 pixels+=line_size;\ | |
678 block +=line_size;\ | |
679 a= LD64(pixels );\ | |
680 b= LD64(pixels+1);\ | |
681 l0= (a&0x0303030303030303ULL)\ | |
682 + (b&0x0303030303030303ULL)\ | |
683 + 0x0202020202020202ULL;\ | |
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
687 pixels+=line_size;\ | |
688 block +=line_size;\ | |
689 }\ | |
690 }\ | |
691 \ | |
859 | 692 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 693 {\ |
694 int i;\ | |
695 const uint64_t a= LD64(pixels );\ | |
696 const uint64_t b= LD64(pixels+1);\ | |
697 uint64_t l0= (a&0x0303030303030303ULL)\ | |
698 + (b&0x0303030303030303ULL)\ | |
699 + 0x0101010101010101ULL;\ | |
700 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
701 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
702 uint64_t l1,h1;\ | |
703 \ | |
704 pixels+=line_size;\ | |
705 for(i=0; i<h; i+=2){\ | |
706 uint64_t a= LD64(pixels );\ | |
707 uint64_t b= LD64(pixels+1);\ | |
708 l1= (a&0x0303030303030303ULL)\ | |
709 + (b&0x0303030303030303ULL);\ | |
710 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
711 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
712 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
713 pixels+=line_size;\ | |
714 block +=line_size;\ | |
715 a= LD64(pixels );\ | |
716 b= LD64(pixels+1);\ | |
717 l0= (a&0x0303030303030303ULL)\ | |
718 + (b&0x0303030303030303ULL)\ | |
719 + 0x0101010101010101ULL;\ | |
720 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
721 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
722 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
723 pixels+=line_size;\ | |
724 block +=line_size;\ | |
725 }\ | |
726 }\ | |
727 \ | |
859 | 728 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\ |
729 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\ | |
730 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\ | |
731 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\ | |
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\ | |
733 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\ | |
734 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8) | |
385 | 735 |
736 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) | |
737 #else // 64 bit variant | |
738 | |
739 #define PIXOP2(OPNAME, OP) \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
740 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
741 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
742 for(i=0; i<h; i++){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
743 OP(*((uint16_t*)(block )), LD16(pixels ));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
744 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
745 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
746 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
747 }\ |
1168 | 748 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
749 int i;\ | |
750 for(i=0; i<h; i++){\ | |
751 OP(*((uint32_t*)(block )), LD32(pixels ));\ | |
752 pixels+=line_size;\ | |
753 block +=line_size;\ | |
754 }\ | |
755 }\ | |
859 | 756 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
385 | 757 int i;\ |
758 for(i=0; i<h; i++){\ | |
759 OP(*((uint32_t*)(block )), LD32(pixels ));\ | |
760 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\ | |
761 pixels+=line_size;\ | |
762 block +=line_size;\ | |
763 }\ | |
764 }\ | |
859 | 765 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
766 OPNAME ## _pixels8_c(block, pixels, line_size, h);\ | |
651 | 767 }\ |
385 | 768 \ |
651 | 769 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
770 int src_stride1, int src_stride2, int h){\ | |
385 | 771 int i;\ |
772 for(i=0; i<h; i++){\ | |
651 | 773 uint32_t a,b;\ |
774 a= LD32(&src1[i*src_stride1 ]);\ | |
775 b= LD32(&src2[i*src_stride2 ]);\ | |
1264 | 776 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\ |
651 | 777 a= LD32(&src1[i*src_stride1+4]);\ |
778 b= LD32(&src2[i*src_stride2+4]);\ | |
1264 | 779 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\ |
385 | 780 }\ |
781 }\ | |
782 \ | |
651 | 783 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
784 int src_stride1, int src_stride2, int h){\ | |
385 | 785 int i;\ |
786 for(i=0; i<h; i++){\ | |
651 | 787 uint32_t a,b;\ |
788 a= LD32(&src1[i*src_stride1 ]);\ | |
789 b= LD32(&src2[i*src_stride2 ]);\ | |
1264 | 790 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
651 | 791 a= LD32(&src1[i*src_stride1+4]);\ |
792 b= LD32(&src2[i*src_stride2+4]);\ | |
1264 | 793 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ |
385 | 794 }\ |
795 }\ | |
796 \ | |
1168 | 797 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
798 int src_stride1, int src_stride2, int h){\ | |
799 int i;\ | |
800 for(i=0; i<h; i++){\ | |
801 uint32_t a,b;\ | |
802 a= LD32(&src1[i*src_stride1 ]);\ | |
803 b= LD32(&src2[i*src_stride2 ]);\ | |
1264 | 804 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
1168 | 805 }\ |
806 }\ | |
807 \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
808 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
809 int src_stride1, int src_stride2, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
810 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
811 for(i=0; i<h; i++){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
812 uint32_t a,b;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
813 a= LD16(&src1[i*src_stride1 ]);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
814 b= LD16(&src2[i*src_stride2 ]);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
815 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
816 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
817 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
818 \ |
651 | 819 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
820 int src_stride1, int src_stride2, int h){\ | |
821 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
822 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
823 }\ | |
824 \ | |
825 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ | |
826 int src_stride1, int src_stride2, int h){\ | |
827 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
828 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
829 }\ | |
830 \ | |
859 | 831 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 832 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
833 }\ | |
834 \ | |
859 | 835 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 836 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
837 }\ | |
838 \ | |
859 | 839 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 840 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
841 }\ | |
842 \ | |
859 | 843 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 844 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
385 | 845 }\ |
846 \ | |
651 | 847 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
848 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
849 int i;\ | |
850 for(i=0; i<h; i++){\ | |
851 uint32_t a, b, c, d, l0, l1, h0, h1;\ | |
852 a= LD32(&src1[i*src_stride1]);\ | |
853 b= LD32(&src2[i*src_stride2]);\ | |
854 c= LD32(&src3[i*src_stride3]);\ | |
855 d= LD32(&src4[i*src_stride4]);\ | |
856 l0= (a&0x03030303UL)\ | |
857 + (b&0x03030303UL)\ | |
858 + 0x02020202UL;\ | |
859 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
860 + ((b&0xFCFCFCFCUL)>>2);\ | |
861 l1= (c&0x03030303UL)\ | |
862 + (d&0x03030303UL);\ | |
863 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
864 + ((d&0xFCFCFCFCUL)>>2);\ | |
865 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
866 a= LD32(&src1[i*src_stride1+4]);\ | |
867 b= LD32(&src2[i*src_stride2+4]);\ | |
868 c= LD32(&src3[i*src_stride3+4]);\ | |
869 d= LD32(&src4[i*src_stride4+4]);\ | |
870 l0= (a&0x03030303UL)\ | |
871 + (b&0x03030303UL)\ | |
872 + 0x02020202UL;\ | |
873 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
874 + ((b&0xFCFCFCFCUL)>>2);\ | |
875 l1= (c&0x03030303UL)\ | |
876 + (d&0x03030303UL);\ | |
877 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
878 + ((d&0xFCFCFCFCUL)>>2);\ | |
879 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
880 }\ | |
881 }\ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
882 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
883 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
884 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
885 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
886 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
887 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
888 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
889 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
890 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
891 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
892 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
893 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
894 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
895 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
896 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
897 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
898 \ |
651 | 899 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
900 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
385 | 901 int i;\ |
902 for(i=0; i<h; i++){\ | |
651 | 903 uint32_t a, b, c, d, l0, l1, h0, h1;\ |
904 a= LD32(&src1[i*src_stride1]);\ | |
905 b= LD32(&src2[i*src_stride2]);\ | |
906 c= LD32(&src3[i*src_stride3]);\ | |
907 d= LD32(&src4[i*src_stride4]);\ | |
908 l0= (a&0x03030303UL)\ | |
909 + (b&0x03030303UL)\ | |
910 + 0x01010101UL;\ | |
911 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
912 + ((b&0xFCFCFCFCUL)>>2);\ | |
913 l1= (c&0x03030303UL)\ | |
914 + (d&0x03030303UL);\ | |
915 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
916 + ((d&0xFCFCFCFCUL)>>2);\ | |
917 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
918 a= LD32(&src1[i*src_stride1+4]);\ | |
919 b= LD32(&src2[i*src_stride2+4]);\ | |
920 c= LD32(&src3[i*src_stride3+4]);\ | |
921 d= LD32(&src4[i*src_stride4+4]);\ | |
922 l0= (a&0x03030303UL)\ | |
923 + (b&0x03030303UL)\ | |
924 + 0x01010101UL;\ | |
925 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
926 + ((b&0xFCFCFCFCUL)>>2);\ | |
927 l1= (c&0x03030303UL)\ | |
928 + (d&0x03030303UL);\ | |
929 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
930 + ((d&0xFCFCFCFCUL)>>2);\ | |
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
385 | 932 }\ |
933 }\ | |
651 | 934 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
935 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
936 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
937 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
938 }\ | |
939 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ | |
940 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
941 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
942 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
943 }\ | |
385 | 944 \ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
945 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
946 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
947 int i, a0, b0, a1, b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
948 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
949 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
950 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
951 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
952 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
953 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
954 for(i=0; i<h; i+=2){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
955 a1= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
956 b1= pixels[1];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
957 a1 += b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
958 b1 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
959 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
960 block[0]= (a1+a0)>>2; /* FIXME non put */\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
961 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
962 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
963 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
964 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
965 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
966 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
967 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
968 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
969 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
970 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
971 block[0]= (a1+a0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
972 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
973 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
974 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
975 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
976 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
977 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* OPNAME##_pixels4_xy2_c: 4-pixel-wide half-pel (x+1/2, y+1/2) average.      \
 * SIMD-within-register: four 8-bit pixels are handled in one 32-bit word.    \
 * The low 2 bits (l0/l1) and high 6 bits (h0/h1) of every byte are kept in   \
 * separate accumulators so that carries cannot spill into the next byte      \
 * lane; 0x02020202 is the +2 rounding term of (a+b+c+d+2)>>2.                \
 * The loop emits two output rows per iteration so each loaded source row     \
 * is reused for the row above and the row below it.                          \
 */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* OPNAME##_pixels8_xy2_c: 8-pixel-wide half-pel (x+1/2, y+1/2) average,      \
 * built as two passes of the 4-byte SWAR kernel (j loop).  Same carry        \
 * partition as the 4-wide variant: l0/l1 hold the low 2 bits of each byte,   \
 * h0/h1 the high 6 bits, 0x02020202 is the +2 rounding of (a+b+c+d+2)>>2.    \
 * After each 4-byte column pass, pixels/block are rewound to the top and     \
 * advanced 4 bytes right for the second column.                              \
 */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
1054 \ | |
/* OPNAME##_no_rnd_pixels8_xy2_c: like OPNAME##_pixels8_xy2_c but with the    \
 * "no rounding" bias 0x01010101 (+1) instead of 0x02020202 (+2), i.e.        \
 * (a+b+c+d+1)>>2 truncating toward the lower value, as required by the       \
 * no_rnd motion-compensation variants.  Structure is otherwise identical:    \
 * two 4-byte SWAR column passes, two output rows per inner iteration.        \
 */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
1095 \ | |
/* 16-pixel-wide variants: run the 8-wide kernel twice, 8 bytes apart. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* write-back operators plugged into PIXOP2: avg = rounding average with the
 * existing destination, put = plain store.  NOTE(review): the bare #endif
 * below closes a conditional that starts before this chunk — do not move. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the whole pixel-op family for both write-back modes. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* scalar rounding averages used by the sub-pel helpers below */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1116 | |
/* Adapters: expose the three-stride l2 (two-source average) primitives with
 * the common single-stride signature by passing the same stride for dst and
 * both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
753 | 1124 |
/*
 * One-warp-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide strip at a fixed 1/16-pel offset (x16, y16), each in 0..16.
 * The four tap weights A..D sum to 256, so ">>8" renormalizes; `rounder`
 * is the caller-supplied rounding constant added before the shift.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            dst[x] = (A * src[x]          + B * src[x + 1]
                    + C * src[stride + x] + D * src[stride + x + 1]
                    + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1147 | |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
/**
 * Affine global motion compensation for one 8-pixel-wide strip.
 * The per-pixel source position is tracked in 16.16 fixed point (vx/vy);
 * dxx/dyx advance it per output column and dxy/dyy per output row, so
 * (ox,oy,dxx..dyy) together describe the affine warp.  `shift` is the
 * warp-point sub-pel precision (s = 1<<shift) and `r` the rounding constant
 * added before the final >>(shift*2).  Source coordinates outside the
 * width x height area are clamped to the nearest edge sample via clip(),
 * which keeps every load in bounds.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;   /* from here on: index of the last valid column */
    height--;  /* index of the last valid row */

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split 16.16 position into integer sample and sub-pel fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compare folds the (<0) and (>=limit) tests into one */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 2x2 neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically clamped: horizontal interpolation only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally clamped: vertical interpolation only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clamped both ways: nearest edge sample, no blend */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1205 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, integer position (dx=0, dy=0): a plain copy.  Dispatches to
 * the put_pixelsN helper matching the block width; unsupported widths are
 * silently ignored, exactly like the original switch with no default. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1214 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=1/3, dy=0: out = round((2*a + b) / 3) with a = src[x],
 * b = src[x+1].  683/2048 (>>11) is the fixed-point approximation of 1/3;
 * the +1 inside implements rounding. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(2*s[x] + s[x+1] + 1)) >> 11;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1225 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=2/3, dy=0: out = round((a + 2*b) / 3) with a = src[x],
 * b = src[x+1].  Same 683/2048 fixed-point 1/3 as the mc10 variant. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(s[x] + 2*s[x+1] + 1)) >> 11;
    }
}
2967 | 1236 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=0, dy=1/3: out = round((2*a + c) / 3) with a = src[x],
 * c = src[x+stride] (the pixel one row below). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(2*s[x] + s[x+stride] + 1)) >> 11;
    }
}
2967 | 1247 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=1/3, dy=1/3: 2-D blend of the 2x2 neighbourhood with
 * weights 4/3/3/2 (sum 12); 2731/32768 (>>15) approximates 1/12 and the
 * +6 term implements rounding. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(4*s[x] + 3*s[x+1] + 3*s[x+stride] + 2*s[x+stride+1] + 6)) >> 15;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1258 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=1/3, dy=2/3: 2-D blend of the 2x2 neighbourhood with
 * weights 3/2/4/3 (sum 12), fixed point via 2731/32768 with +6 rounding. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(3*s[x] + 2*s[x+1] + 4*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1269 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=0, dy=2/3: out = round((a + 2*c) / 3) with a = src[x],
 * c = src[x+stride]. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(s[x] + 2*s[x+stride] + 1)) >> 11;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1280 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=2/3, dy=1/3: 2-D blend of the 2x2 neighbourhood with
 * weights 3/4/2/3 (sum 12), fixed point via 2731/32768 with +6 rounding. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(3*s[x] + 4*s[x+1] + 2*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1291 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel MC, dx=2/3, dy=2/3: 2-D blend of the 2x2 neighbourhood with
 * weights 2/3/3/4 (sum 12), fixed point via 2731/32768 with +6 rounding. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(2*s[x] + 3*s[x+1] + 3*s[x+stride] + 4*s[x+stride+1] + 6)) >> 15;
    }
}
1319 | 1302 |
1303 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1304 switch(width){ | |
1305 case 2: avg_pixels2_c (dst, src, stride, height); break; | |
1306 case 4: avg_pixels4_c (dst, src, stride, height); break; | |
1307 case 8: avg_pixels8_c (dst, src, stride, height); break; | |
1308 case 16:avg_pixels16_c(dst, src, stride, height); break; | |
1309 } | |
1310 } | |
1311 | |
1312 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1313 int i,j; | |
1314 for (i=0; i < height; i++) { | |
1315 for (j=0; j < width; j++) { | |
2979 | 1316 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1; |
1319 | 1317 } |
1318 src += stride; | |
1319 dst += stride; | |
1320 } | |
1321 } | |
1322 | |
1323 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1324 int i,j; | |
1325 for (i=0; i < height; i++) { | |
1326 for (j=0; j < width; j++) { | |
2979 | 1327 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1; |
1319 | 1328 } |
1329 src += stride; | |
1330 dst += stride; | |
1331 } | |
1332 } | |
2967 | 1333 |
1319 | 1334 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
1335 int i,j; | |
1336 for (i=0; i < height; i++) { | |
1337 for (j=0; j < width; j++) { | |
2979 | 1338 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1; |
1319 | 1339 } |
1340 src += stride; | |
1341 dst += stride; | |
1342 } | |
1343 } | |
2967 | 1344 |
1319 | 1345 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
1346 int i,j; | |
1347 for (i=0; i < height; i++) { | |
1348 for (j=0; j < width; j++) { | |
2979 | 1349 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1350 } |
1351 src += stride; | |
1352 dst += stride; | |
1353 } | |
1354 } | |
1355 | |
1356 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1357 int i,j; | |
1358 for (i=0; i < height; i++) { | |
1359 for (j=0; j < width; j++) { | |
2979 | 1360 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1361 } |
1362 src += stride; | |
1363 dst += stride; | |
1364 } | |
1365 } | |
1366 | |
1367 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1368 int i,j; | |
1369 for (i=0; i < height; i++) { | |
1370 for (j=0; j < width; j++) { | |
2979 | 1371 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1; |
1319 | 1372 } |
1373 src += stride; | |
1374 dst += stride; | |
1375 } | |
1376 } | |
1377 | |
1378 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1379 int i,j; | |
1380 for (i=0; i < height; i++) { | |
1381 for (j=0; j < width; j++) { | |
2979 | 1382 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1383 } |
1384 src += stride; | |
1385 dst += stride; | |
1386 } | |
1387 } | |
1388 | |
1389 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1390 int i,j; | |
1391 for (i=0; i < height; i++) { | |
1392 for (j=0; j < width; j++) { | |
2979 | 1393 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1394 } |
1395 src += stride; | |
1396 dst += stride; | |
1397 } | |
1398 } | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
#if 0
/* Disabled, never-compiled sketch of per-width tpel wrappers that would
 * forward to the generic put_tpel_pixels_mcXY_c helpers above.
 * NOTE(review): the bodies are not valid C as written ("void" appears in
 * call position), so this block cannot simply be enabled as-is. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1420 |
1168 | 1421 #define H264_CHROMA_MC(OPNAME, OP)\ |
1422 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1423 const int A=(8-x)*(8-y);\ | |
1424 const int B=( x)*(8-y);\ | |
1425 const int C=(8-x)*( y);\ | |
1426 const int D=( x)*( y);\ | |
1427 int i;\ | |
1428 \ | |
1429 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1430 \ | |
1431 for(i=0; i<h; i++)\ | |
1432 {\ | |
1433 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ | |
1434 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1435 dst+= stride;\ | |
1436 src+= stride;\ | |
1437 }\ | |
1438 }\ | |
1439 \ | |
1440 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1441 const int A=(8-x)*(8-y);\ | |
1442 const int B=( x)*(8-y);\ | |
1443 const int C=(8-x)*( y);\ | |
1444 const int D=( x)*( y);\ | |
1445 int i;\ | |
1446 \ | |
1447 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1448 \ | |
1449 for(i=0; i<h; i++)\ | |
1450 {\ | |
1451 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ | |
1452 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1453 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ | |
1454 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ | |
1455 dst+= stride;\ | |
1456 src+= stride;\ | |
1457 }\ | |
1458 }\ | |
1459 \ | |
1460 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1461 const int A=(8-x)*(8-y);\ | |
1462 const int B=( x)*(8-y);\ | |
1463 const int C=(8-x)*( y);\ | |
1464 const int D=( x)*( y);\ | |
1465 int i;\ | |
1466 \ | |
1467 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1468 \ | |
1469 for(i=0; i<h; i++)\ | |
1470 {\ | |
1471 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ | |
1472 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1473 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ | |
1474 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ | |
1475 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\ | |
1476 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\ | |
1477 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\ | |
1478 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\ | |
1479 dst+= stride;\ | |
1480 src+= stride;\ | |
1481 }\ | |
1482 } | |
1483 | |
1484 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) | |
1485 #define op_put(a, b) a = (((b) + 32)>>6) | |
1486 | |
1487 H264_CHROMA_MC(put_ , op_put) | |
1488 H264_CHROMA_MC(avg_ , op_avg) | |
1489 #undef op_avg | |
1490 #undef op_put | |
1491 | |
3663 | 1492 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ |
1493 const int A=(8-x)*(8-y); | |
1494 const int B=( x)*(8-y); | |
1495 const int C=(8-x)*( y); | |
1496 const int D=( x)*( y); | |
1497 int i; | |
1498 | |
1499 assert(x<8 && y<8 && x>=0 && y>=0); | |
1500 | |
1501 for(i=0; i<h; i++) | |
1502 { | |
1503 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6; | |
1504 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6; | |
1505 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6; | |
1506 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6; | |
1507 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6; | |
1508 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6; | |
1509 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6; | |
1510 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6; | |
1511 dst+= stride; | |
1512 src+= stride; | |
1513 } | |
1514 } | |
1515 | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1516 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1517 { |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1518 int i; |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1519 for(i=0; i<h; i++) |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1520 { |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1521 ST16(dst , LD16(src )); |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1522 dst+=dstStride; |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1523 src+=srcStride; |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1524 } |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1525 } |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
1526 |
1168 | 1527 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) |
1528 { | |
1529 int i; | |
1530 for(i=0; i<h; i++) | |
1531 { | |
1532 ST32(dst , LD32(src )); | |
1533 dst+=dstStride; | |
1534 src+=srcStride; | |
1535 } | |
1536 } | |
1537 | |
1538 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) | |
1539 { | |
1540 int i; | |
1541 for(i=0; i<h; i++) | |
1542 { | |
1543 ST32(dst , LD32(src )); | |
1544 ST32(dst+4 , LD32(src+4 )); | |
1545 dst+=dstStride; | |
1546 src+=srcStride; | |
1547 } | |
1548 } | |
1549 | |
1550 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) | |
1551 { | |
1552 int i; | |
1553 for(i=0; i<h; i++) | |
1554 { | |
1555 ST32(dst , LD32(src )); | |
1556 ST32(dst+4 , LD32(src+4 )); | |
1557 ST32(dst+8 , LD32(src+8 )); | |
1558 ST32(dst+12, LD32(src+12)); | |
1559 dst+=dstStride; | |
1560 src+=srcStride; | |
1561 } | |
1562 } | |
753 | 1563 |
1064 | 1564 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) |
255 | 1565 { |
1566 int i; | |
1567 for(i=0; i<h; i++) | |
1568 { | |
651 | 1569 ST32(dst , LD32(src )); |
1570 ST32(dst+4 , LD32(src+4 )); | |
1571 ST32(dst+8 , LD32(src+8 )); | |
1572 ST32(dst+12, LD32(src+12)); | |
1573 dst[16]= src[16]; | |
255 | 1574 dst+=dstStride; |
1575 src+=srcStride; | |
1576 } | |
1577 } | |
1578 | |
1064 | 1579 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) |
255 | 1580 { |
1581 int i; | |
651 | 1582 for(i=0; i<h; i++) |
255 | 1583 { |
651 | 1584 ST32(dst , LD32(src )); |
1585 ST32(dst+4 , LD32(src+4 )); | |
1586 dst[8]= src[8]; | |
255 | 1587 dst+=dstStride; |
1588 src+=srcStride; | |
1589 } | |
1590 } | |
1591 | |
954 | 1592 |
651 | 1593 #define QPEL_MC(r, OPNAME, RND, OP) \ |
1064 | 1594 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
1595 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ | |
651 | 1596 int i;\ |
1597 for(i=0; i<h; i++)\ | |
1598 {\ | |
1599 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\ | |
1600 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\ | |
1601 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\ | |
1602 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\ | |
1603 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\ | |
1604 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\ | |
1605 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\ | |
1606 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\ | |
1607 dst+=dstStride;\ | |
1608 src+=srcStride;\ | |
1609 }\ | |
1610 }\ | |
1611 \ | |
1064 | 1612 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
984 | 1613 const int w=8;\ |
1064 | 1614 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ |
651 | 1615 int i;\ |
1616 for(i=0; i<w; i++)\ | |
1617 {\ | |
1618 const int src0= src[0*srcStride];\ | |
1619 const int src1= src[1*srcStride];\ | |
1620 const int src2= src[2*srcStride];\ | |
1621 const int src3= src[3*srcStride];\ | |
1622 const int src4= src[4*srcStride];\ | |
1623 const int src5= src[5*srcStride];\ | |
1624 const int src6= src[6*srcStride];\ | |
1625 const int src7= src[7*srcStride];\ | |
1626 const int src8= src[8*srcStride];\ | |
1627 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\ | |
1628 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\ | |
1629 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\ | |
1630 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\ | |
1631 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\ | |
1632 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\ | |
1633 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\ | |
1634 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\ | |
1635 dst++;\ | |
1636 src++;\ | |
1637 }\ | |
1638 }\ | |
1639 \ | |
1064 | 1640 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
1641 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ | |
651 | 1642 int i;\ |
954 | 1643 \ |
651 | 1644 for(i=0; i<h; i++)\ |
1645 {\ | |
1646 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\ | |
1647 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\ | |
1648 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\ | |
1649 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\ | |
1650 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\ | |
1651 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\ | |
1652 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\ | |
1653 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\ | |
1654 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\ | |
1655 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\ | |
1656 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\ | |
1657 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\ | |
1658 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\ | |
1659 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\ | |
1660 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\ | |
1661 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\ | |
1662 dst+=dstStride;\ | |
1663 src+=srcStride;\ | |
1664 }\ | |
255 | 1665 }\ |
1666 \ | |
1064 | 1667 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
1668 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ | |
651 | 1669 int i;\ |
954 | 1670 const int w=16;\ |
651 | 1671 for(i=0; i<w; i++)\ |
1672 {\ | |
1673 const int src0= src[0*srcStride];\ | |
1674 const int src1= src[1*srcStride];\ | |
1675 const int src2= src[2*srcStride];\ | |
1676 const int src3= src[3*srcStride];\ | |
1677 const int src4= src[4*srcStride];\ | |
1678 const int src5= src[5*srcStride];\ | |
1679 const int src6= src[6*srcStride];\ | |
1680 const int src7= src[7*srcStride];\ | |
1681 const int src8= src[8*srcStride];\ | |
1682 const int src9= src[9*srcStride];\ | |
1683 const int src10= src[10*srcStride];\ | |
1684 const int src11= src[11*srcStride];\ | |
1685 const int src12= src[12*srcStride];\ | |
1686 const int src13= src[13*srcStride];\ | |
1687 const int src14= src[14*srcStride];\ | |
1688 const int src15= src[15*srcStride];\ | |
1689 const int src16= src[16*srcStride];\ | |
1690 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\ | |
1691 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\ | |
1692 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\ | |
1693 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\ | |
1694 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\ | |
1695 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\ | |
1696 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\ | |
1697 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\ | |
1698 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\ | |
1699 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\ | |
1700 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\ | |
1701 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\ | |
1702 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\ | |
1703 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\ | |
1704 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\ | |
1705 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\ | |
1706 dst++;\ | |
1707 src++;\ | |
1708 }\ | |
255 | 1709 }\ |
1710 \ | |
1064 | 1711 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\ |
859 | 1712 OPNAME ## pixels8_c(dst, src, stride, 8);\ |
255 | 1713 }\ |
1714 \ | |
1064 | 1715 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\ |
1716 uint8_t half[64];\ | |
651 | 1717 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\ |
1718 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\ | |
1719 }\ | |
1720 \ | |
1064 | 1721 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\ |
651 | 1722 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\ |
255 | 1723 }\ |
1724 \ | |
1064 | 1725 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\ |
1726 uint8_t half[64];\ | |
651 | 1727 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\ |
1728 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\ | |
1729 }\ | |
1730 \ | |
1064 | 1731 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\ |
1732 uint8_t full[16*9];\ | |
1733 uint8_t half[64];\ | |
651 | 1734 copy_block9(full, src, 16, stride, 9);\ |
984 | 1735 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\ |
651 | 1736 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\ |
1737 }\ | |
1738 \ | |
1064 | 1739 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\ |
1740 uint8_t full[16*9];\ | |
651 | 1741 copy_block9(full, src, 16, stride, 9);\ |
984 | 1742 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\ |
255 | 1743 }\ |
1744 \ | |
1064 | 1745 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\ |
1746 uint8_t full[16*9];\ | |
1747 uint8_t half[64];\ | |
651 | 1748 copy_block9(full, src, 16, stride, 9);\ |
984 | 1749 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\ |
651 | 1750 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\ |
1751 }\ | |
1064 | 1752 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1753 uint8_t full[16*9];\ | |
1754 uint8_t halfH[72];\ | |
1755 uint8_t halfV[64];\ | |
1756 uint8_t halfHV[64];\ | |
651 | 1757 copy_block9(full, src, 16, stride, 9);\ |
1758 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
984 | 1759 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\ |
1760 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
651 | 1761 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ |
255 | 1762 }\ |
1064 | 1763 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\ |
1764 uint8_t full[16*9];\ | |
1765 uint8_t halfH[72];\ | |
1766 uint8_t halfHV[64];\ | |
984 | 1767 copy_block9(full, src, 16, stride, 9);\ |
1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
1769 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\ | |
1770 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
1771 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\ | |
1772 }\ | |
1064 | 1773 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1774 uint8_t full[16*9];\ | |
1775 uint8_t halfH[72];\ | |
1776 uint8_t halfV[64];\ | |
1777 uint8_t halfHV[64];\ | |
651 | 1778 copy_block9(full, src, 16, stride, 9);\ |
1779 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
984 | 1780 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\ |
1781 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
651 | 1782 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ |
255 | 1783 }\ |
1064 | 1784 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\ |
1785 uint8_t full[16*9];\ | |
1786 uint8_t halfH[72];\ | |
1787 uint8_t halfHV[64];\ | |
984 | 1788 copy_block9(full, src, 16, stride, 9);\ |
1789 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
1790 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\ | |
1791 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
1792 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\ | |
1793 }\ | |
1064 | 1794 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1795 uint8_t full[16*9];\ | |
1796 uint8_t halfH[72];\ | |
1797 uint8_t halfV[64];\ | |
1798 uint8_t halfHV[64];\ | |
651 | 1799 copy_block9(full, src, 16, stride, 9);\ |
1800 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
984 | 1801 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\ |
1802 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
651 | 1803 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ |
1804 }\ | |
1064 | 1805 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\ |
1806 uint8_t full[16*9];\ | |
1807 uint8_t halfH[72];\ | |
1808 uint8_t halfHV[64];\ | |
984 | 1809 copy_block9(full, src, 16, stride, 9);\ |
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
1811 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\ | |
1812 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
1813 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\ | |
1814 }\ | |
1064 | 1815 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1816 uint8_t full[16*9];\ | |
1817 uint8_t halfH[72];\ | |
1818 uint8_t halfV[64];\ | |
1819 uint8_t halfHV[64];\ | |
651 | 1820 copy_block9(full, src, 16, stride, 9);\ |
1821 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\ | |
984 | 1822 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\ |
1823 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
651 | 1824 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\ |
255 | 1825 }\ |
1064 | 1826 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\ |
1827 uint8_t full[16*9];\ | |
1828 uint8_t halfH[72];\ | |
1829 uint8_t halfHV[64];\ | |
984 | 1830 copy_block9(full, src, 16, stride, 9);\ |
1831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
1832 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\ | |
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
1834 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\ | |
1835 }\ | |
1064 | 1836 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\ |
1837 uint8_t halfH[72];\ | |
1838 uint8_t halfHV[64];\ | |
651 | 1839 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\ |
984 | 1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ |
651 | 1841 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\ |
1842 }\ | |
1064 | 1843 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\ |
1844 uint8_t halfH[72];\ | |
1845 uint8_t halfHV[64];\ | |
651 | 1846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\ |
984 | 1847 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ |
651 | 1848 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\ |
1849 }\ | |
1064 | 1850 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1851 uint8_t full[16*9];\ | |
1852 uint8_t halfH[72];\ | |
1853 uint8_t halfV[64];\ | |
1854 uint8_t halfHV[64];\ | |
651 | 1855 copy_block9(full, src, 16, stride, 9);\ |
1856 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
984 | 1857 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\ |
1858 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
651 | 1859 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\ |
255 | 1860 }\ |
1064 | 1861 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\ |
1862 uint8_t full[16*9];\ | |
1863 uint8_t halfH[72];\ | |
984 | 1864 copy_block9(full, src, 16, stride, 9);\ |
1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
1866 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\ | |
1867 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\ | |
1868 }\ | |
1064 | 1869 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1870 uint8_t full[16*9];\ | |
1871 uint8_t halfH[72];\ | |
1872 uint8_t halfV[64];\ | |
1873 uint8_t halfHV[64];\ | |
651 | 1874 copy_block9(full, src, 16, stride, 9);\ |
1875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
984 | 1876 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\ |
1877 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\ | |
651 | 1878 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\ |
1879 }\ | |
1064 | 1880 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\ |
1881 uint8_t full[16*9];\ | |
1882 uint8_t halfH[72];\ | |
984 | 1883 copy_block9(full, src, 16, stride, 9);\ |
1884 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\ | |
1885 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\ | |
1886 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\ | |
1887 }\ | |
1064 | 1888 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\ |
1889 uint8_t halfH[72];\ | |
651 | 1890 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\ |
984 | 1891 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\ |
651 | 1892 }\ |
1064 | 1893 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\ |
859 | 1894 OPNAME ## pixels16_c(dst, src, stride, 16);\ |
255 | 1895 }\ |
651 | 1896 \ |
1064 | 1897 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\ |
1898 uint8_t half[256];\ | |
651 | 1899 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\ |
1900 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\ | |
1901 }\ | |
1902 \ | |
1064 | 1903 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\ |
651 | 1904 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\ |
1905 }\ | |
1906 \ | |
1064 | 1907 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\ |
1908 uint8_t half[256];\ | |
651 | 1909 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\ |
1910 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\ | |
1911 }\ | |
1912 \ | |
1064 | 1913 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\ |
1914 uint8_t full[24*17];\ | |
1915 uint8_t half[256];\ | |
651 | 1916 copy_block17(full, src, 24, stride, 17);\ |
954 | 1917 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\ |
651 | 1918 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\ |
255 | 1919 }\ |
651 | 1920 \ |
1064 | 1921 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\ |
1922 uint8_t full[24*17];\ | |
651 | 1923 copy_block17(full, src, 24, stride, 17);\ |
954 | 1924 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\ |
651 | 1925 }\ |
1926 \ | |
1064 | 1927 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\ |
1928 uint8_t full[24*17];\ | |
1929 uint8_t half[256];\ | |
651 | 1930 copy_block17(full, src, 24, stride, 17);\ |
954 | 1931 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\ |
651 | 1932 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\ |
255 | 1933 }\ |
1064 | 1934 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1935 uint8_t full[24*17];\ | |
1936 uint8_t halfH[272];\ | |
1937 uint8_t halfV[256];\ | |
1938 uint8_t halfHV[256];\ | |
651 | 1939 copy_block17(full, src, 24, stride, 17);\ |
1940 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
954 | 1941 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\ |
1942 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
651 | 1943 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ |
1944 }\ | |
1064 | 1945 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\ |
1946 uint8_t full[24*17];\ | |
1947 uint8_t halfH[272];\ | |
1948 uint8_t halfHV[256];\ | |
984 | 1949 copy_block17(full, src, 24, stride, 17);\ |
1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
1951 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\ | |
1952 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
1953 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\ | |
1954 }\ | |
1064 | 1955 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1956 uint8_t full[24*17];\ | |
1957 uint8_t halfH[272];\ | |
1958 uint8_t halfV[256];\ | |
1959 uint8_t halfHV[256];\ | |
651 | 1960 copy_block17(full, src, 24, stride, 17);\ |
1961 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
954 | 1962 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\ |
1963 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
651 | 1964 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ |
1965 }\ | |
1064 | 1966 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\ |
1967 uint8_t full[24*17];\ | |
1968 uint8_t halfH[272];\ | |
1969 uint8_t halfHV[256];\ | |
984 | 1970 copy_block17(full, src, 24, stride, 17);\ |
1971 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
1972 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\ | |
1973 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
1974 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\ | |
1975 }\ | |
1064 | 1976 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1977 uint8_t full[24*17];\ | |
1978 uint8_t halfH[272];\ | |
1979 uint8_t halfV[256];\ | |
1980 uint8_t halfHV[256];\ | |
651 | 1981 copy_block17(full, src, 24, stride, 17);\ |
1982 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
954 | 1983 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\ |
1984 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
651 | 1985 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ |
255 | 1986 }\ |
1064 | 1987 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\ |
1988 uint8_t full[24*17];\ | |
1989 uint8_t halfH[272];\ | |
1990 uint8_t halfHV[256];\ | |
984 | 1991 copy_block17(full, src, 24, stride, 17);\ |
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
1993 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\ | |
1994 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
1995 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\ | |
1996 }\ | |
1064 | 1997 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
1998 uint8_t full[24*17];\ | |
1999 uint8_t halfH[272];\ | |
2000 uint8_t halfV[256];\ | |
2001 uint8_t halfHV[256];\ | |
651 | 2002 copy_block17(full, src, 24, stride, 17);\ |
2003 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\ | |
954 | 2004 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\ |
2005 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
651 | 2006 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\ |
2007 }\ | |
1064 | 2008 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\ |
2009 uint8_t full[24*17];\ | |
2010 uint8_t halfH[272];\ | |
2011 uint8_t halfHV[256];\ | |
984 | 2012 copy_block17(full, src, 24, stride, 17);\ |
2013 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
2014 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\ | |
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
2016 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\ | |
2017 }\ | |
1064 | 2018 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\ |
2019 uint8_t halfH[272];\ | |
2020 uint8_t halfHV[256];\ | |
651 | 2021 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\ |
954 | 2022 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ |
651 | 2023 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\ |
255 | 2024 }\ |
1064 | 2025 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\ |
2026 uint8_t halfH[272];\ | |
2027 uint8_t halfHV[256];\ | |
651 | 2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\ |
954 | 2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ |
651 | 2030 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\ |
2031 }\ | |
1064 | 2032 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
2033 uint8_t full[24*17];\ | |
2034 uint8_t halfH[272];\ | |
2035 uint8_t halfV[256];\ | |
2036 uint8_t halfHV[256];\ | |
651 | 2037 copy_block17(full, src, 24, stride, 17);\ |
2038 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
954 | 2039 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\ |
2040 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
651 | 2041 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\ |
255 | 2042 }\ |
1064 | 2043 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\ |
2044 uint8_t full[24*17];\ | |
2045 uint8_t halfH[272];\ | |
984 | 2046 copy_block17(full, src, 24, stride, 17);\ |
2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
2048 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\ | |
2049 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\ | |
2050 }\ | |
1064 | 2051 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\ |
2052 uint8_t full[24*17];\ | |
2053 uint8_t halfH[272];\ | |
2054 uint8_t halfV[256];\ | |
2055 uint8_t halfHV[256];\ | |
651 | 2056 copy_block17(full, src, 24, stride, 17);\ |
2057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
954 | 2058 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\ |
2059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\ | |
651 | 2060 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\ |
2061 }\ | |
1064 | 2062 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\ |
2063 uint8_t full[24*17];\ | |
2064 uint8_t halfH[272];\ | |
984 | 2065 copy_block17(full, src, 24, stride, 17);\ |
2066 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\ | |
2067 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\ | |
2068 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\ | |
2069 }\ | |
1064 | 2070 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\ |
2071 uint8_t halfH[272];\ | |
651 | 2072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\ |
954 | 2073 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\ |
859 | 2074 } |
255 | 2075 |
/* Pixel-store operators plugged into QPEL_MC as its OP argument.
 * b is a filter sum scaled by 32: "+ 16" rounds (or "+ 15" for the
 * no-rounding variants) before ">>5" normalizes, and cm[] clips the
 * result to valid pixel range.  op_avg additionally averages with the
 * pixel already in dst (the extra +1 rounds that average). */
651 | 2076 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2077 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2078 #define op_put(a, b) a = cm[((b) + 16)>>5]
2079 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2080
/* Instantiate the quarter-pel MC function families: put, put_no_rnd
 * and avg.  The avg_no_rnd family is intentionally left disabled. */
2081 QPEL_MC(0, put_ , _ , op_put)
2082 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2083 QPEL_MC(0, avg_ , _ , op_avg)
2084 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The operator macros are only needed by the instantiations above. */
2085 #undef op_avg
2086 #undef op_avg_no_rnd
2087 #undef op_put
2088 #undef op_put_no_rnd
255 | 2089 |
1168 | 2090 #if 1 |
2091 #define H264_LOWPASS(OPNAME, OP, OP2) \ | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2092 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2093 const int h=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2094 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2095 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2096 for(i=0; i<h; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2097 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2098 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2099 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2100 dst+=dstStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2101 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2102 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2103 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2104 \ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2105 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2106 const int w=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2107 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2108 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2109 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2110 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2111 const int srcB= src[-2*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2112 const int srcA= src[-1*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2113 const int src0= src[0 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2114 const int src1= src[1 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2115 const int src2= src[2 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2116 const int src3= src[3 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2117 const int src4= src[4 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2118 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2119 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2120 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2121 src++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2122 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2123 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2124 \ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2125 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2126 const int h=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2127 const int w=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2128 uint8_t *cm = cropTbl + MAX_NEG_CROP;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2129 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2130 src -= 2*srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2131 for(i=0; i<h+5; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2132 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2133 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2134 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2135 tmp+=tmpStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2136 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2137 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2138 tmp -= tmpStride*(h+5-2);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2139 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2140 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2141 const int tmpB= tmp[-2*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2142 const int tmpA= tmp[-1*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2143 const int tmp0= tmp[0 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2144 const int tmp1= tmp[1 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2145 const int tmp2= tmp[2 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2146 const int tmp3= tmp[3 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2147 const int tmp4= tmp[4 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2148 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2149 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2150 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2151 tmp++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2152 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2153 }\ |
/* Part of the H264_LOWPASS macro body.  Horizontal 6-tap lowpass */\
/* (taps 1,-5,20,20,-5,1) over a 4-wide, 4-row block; OP (chosen at */\
/* macro instantiation) normalizes/stores the sum into dst, clipping */\
/* via cm (cropTbl).  src must be readable from src[-2] to src[6]. */\
1168 | 2154 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2155 const int h=4;\
2156 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2157 int i;\
2158 for(i=0; i<h; i++)\
2159 {\
2160 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2161 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2162 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2163 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2164 dst+=dstStride;\
2165 src+=srcStride;\
2166 }\
2167 }\
2168 \ | |
/* Part of the H264_LOWPASS macro body.  Vertical 6-tap lowpass */\
/* (1,-5,20,20,-5,1) over a 4-column, 4-row block: one column per */\
/* outer iteration, reading rows -2..6 of src.  OP stores via cm. */\
2169 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2170 const int w=4;\
2171 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2172 int i;\
2173 for(i=0; i<w; i++)\
2174 {\
2175 const int srcB= src[-2*srcStride];\
2176 const int srcA= src[-1*srcStride];\
2177 const int src0= src[0 *srcStride];\
2178 const int src1= src[1 *srcStride];\
2179 const int src2= src[2 *srcStride];\
2180 const int src3= src[3 *srcStride];\
2181 const int src4= src[4 *srcStride];\
2182 const int src5= src[5 *srcStride];\
2183 const int src6= src[6 *srcStride];\
2184 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2185 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2186 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2187 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2188 dst++;\
2189 src++;\
2190 }\
2191 }\
2192 \ | |
/* Part of the H264_LOWPASS macro body.  2-D (half-pel center) filter */\
/* for a 4x4 block, done in two passes: first the horizontal 6-tap */\
/* filter into the int16_t scratch tmp (h+5 rows, enough context for */\
/* the vertical taps), then the vertical 6-tap filter over tmp with */\
/* OP2 writing the final pixels to dst. */\
2193 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2194 const int h=4;\
2195 const int w=4;\
2196 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2197 int i;\
2198 src -= 2*srcStride;\
2199 for(i=0; i<h+5; i++)\
2200 {\
2201 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2202 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2203 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2204 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2205 tmp+=tmpStride;\
2206 src+=srcStride;\
2207 }\
/* Rewind to row 0 of the intermediate (2 rows of lead-in context). */\
2208 tmp -= tmpStride*(h+5-2);\
2209 for(i=0; i<w; i++)\
2210 {\
2211 const int tmpB= tmp[-2*tmpStride];\
2212 const int tmpA= tmp[-1*tmpStride];\
2213 const int tmp0= tmp[0 *tmpStride];\
2214 const int tmp1= tmp[1 *tmpStride];\
2215 const int tmp2= tmp[2 *tmpStride];\
2216 const int tmp3= tmp[3 *tmpStride];\
2217 const int tmp4= tmp[4 *tmpStride];\
2218 const int tmp5= tmp[5 *tmpStride];\
2219 const int tmp6= tmp[6 *tmpStride];\
2220 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2221 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2222 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2223 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2224 dst++;\
2225 tmp++;\
2226 }\
2227 }\
2228 \ | |
/* Part of the H264_LOWPASS macro body.  8-wide version of the */\
/* horizontal 6-tap (1,-5,20,20,-5,1) lowpass; fully unrolled per */\
/* row, reading src[-2]..src[10].  OP stores/clips via cm. */\
2229 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230 const int h=8;\
2231 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2232 int i;\
2233 for(i=0; i<h; i++)\
2234 {\
2235 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2236 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2237 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2238 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2239 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2240 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2241 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2242 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2243 dst+=dstStride;\
2244 src+=srcStride;\
2245 }\
2246 }\
2247 \ | |
/* Part of the H264_LOWPASS macro body.  8-tall version of the */\
/* vertical 6-tap lowpass; one column per outer iteration, reading */\
/* rows -2..10 of src.  OP stores/clips via cm. */\
2248 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2249 const int w=8;\
2250 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2251 int i;\
2252 for(i=0; i<w; i++)\
2253 {\
2254 const int srcB= src[-2*srcStride];\
2255 const int srcA= src[-1*srcStride];\
2256 const int src0= src[0 *srcStride];\
2257 const int src1= src[1 *srcStride];\
2258 const int src2= src[2 *srcStride];\
2259 const int src3= src[3 *srcStride];\
2260 const int src4= src[4 *srcStride];\
2261 const int src5= src[5 *srcStride];\
2262 const int src6= src[6 *srcStride];\
2263 const int src7= src[7 *srcStride];\
2264 const int src8= src[8 *srcStride];\
2265 const int src9= src[9 *srcStride];\
2266 const int src10=src[10*srcStride];\
2267 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2268 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2269 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2270 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2271 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2272 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2273 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2274 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2275 dst++;\
2276 src++;\
2277 }\
2278 }\
2279 \ | |
/* Part of the H264_LOWPASS macro body.  8x8 two-pass 2-D filter: */\
/* horizontal 6-tap pass into int16_t tmp (h+5 rows of context), */\
/* then vertical 6-tap pass over tmp, with OP2 writing dst. */\
2280 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2281 const int h=8;\
2282 const int w=8;\
2283 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2284 int i;\
2285 src -= 2*srcStride;\
2286 for(i=0; i<h+5; i++)\
2287 {\
2288 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2289 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2290 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2291 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2292 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2293 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2294 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2295 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2296 tmp+=tmpStride;\
2297 src+=srcStride;\
2298 }\
/* Rewind to row 0 of the intermediate (2 rows of lead-in context). */\
2299 tmp -= tmpStride*(h+5-2);\
2300 for(i=0; i<w; i++)\
2301 {\
2302 const int tmpB= tmp[-2*tmpStride];\
2303 const int tmpA= tmp[-1*tmpStride];\
2304 const int tmp0= tmp[0 *tmpStride];\
2305 const int tmp1= tmp[1 *tmpStride];\
2306 const int tmp2= tmp[2 *tmpStride];\
2307 const int tmp3= tmp[3 *tmpStride];\
2308 const int tmp4= tmp[4 *tmpStride];\
2309 const int tmp5= tmp[5 *tmpStride];\
2310 const int tmp6= tmp[6 *tmpStride];\
2311 const int tmp7= tmp[7 *tmpStride];\
2312 const int tmp8= tmp[8 *tmpStride];\
2313 const int tmp9= tmp[9 *tmpStride];\
2314 const int tmp10=tmp[10*tmpStride];\
2315 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2316 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2317 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2318 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2319 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2320 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2321 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2322 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2323 dst++;\
2324 tmp++;\
2325 }\
2326 }\
2327 \ | |
/* 16x16 vertical lowpass assembled from four 8x8 calls: */\
/* top-left, top-right, then advance 8 rows for the bottom half. */\
2328 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2329 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2330 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2331 src += 8*srcStride;\
2332 dst += 8*dstStride;\
2333 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2334 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2335 }\
2336 \ | |
/* 16x16 horizontal lowpass assembled from four 8x8 calls: */\
/* top-left, top-right, then advance 8 rows for the bottom half. */\
2337 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2338 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2339 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2340 src += 8*srcStride;\
2341 dst += 8*dstStride;\
2342 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2343 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2344 }\
2345 \ | |
/* 16x16 2-D lowpass assembled from four 8x8 hv calls; tmp is the */\
/* int16_t intermediate shared by the halves (tmp+8 for the right). */\
2346 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2347 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2348 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2349 src += 8*srcStride;\
2350 dst += 8*dstStride;\
2351 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2352 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2353 }\
2354 | |
2355 #define H264_MC(OPNAME, SIZE) \ | |
2356 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\ | |
2357 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ | |
2358 }\ | |
2359 \ | |
2360 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2361 uint8_t half[SIZE*SIZE];\ | |
2362 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ | |
2363 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ | |
2364 }\ | |
2365 \ | |
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2367 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\ | |
2368 }\ | |
2369 \ | |
2370 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2371 uint8_t half[SIZE*SIZE];\ | |
2372 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ | |
2373 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ | |
2374 }\ | |
2375 \ | |
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2377 uint8_t full[SIZE*(SIZE+5)];\ | |
2378 uint8_t * const full_mid= full + SIZE*2;\ | |
2379 uint8_t half[SIZE*SIZE];\ | |
2380 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2381 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ | |
2382 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ | |
2383 }\ | |
2384 \ | |
2385 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2386 uint8_t full[SIZE*(SIZE+5)];\ | |
2387 uint8_t * const full_mid= full + SIZE*2;\ | |
2388 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2389 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\ | |
2390 }\ | |
2391 \ | |
2392 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2393 uint8_t full[SIZE*(SIZE+5)];\ | |
2394 uint8_t * const full_mid= full + SIZE*2;\ | |
2395 uint8_t half[SIZE*SIZE];\ | |
2396 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2397 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ | |
2398 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ | |
2399 }\ | |
2400 \ | |
2401 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2402 uint8_t full[SIZE*(SIZE+5)];\ | |
2403 uint8_t * const full_mid= full + SIZE*2;\ | |
2404 uint8_t halfH[SIZE*SIZE];\ | |
2405 uint8_t halfV[SIZE*SIZE];\ | |
2406 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2407 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2408 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2409 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2410 }\ | |
2411 \ | |
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2413 uint8_t full[SIZE*(SIZE+5)];\ | |
2414 uint8_t * const full_mid= full + SIZE*2;\ | |
2415 uint8_t halfH[SIZE*SIZE];\ | |
2416 uint8_t halfV[SIZE*SIZE];\ | |
2417 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2418 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2419 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2420 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2421 }\ | |
2422 \ | |
2423 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2424 uint8_t full[SIZE*(SIZE+5)];\ | |
2425 uint8_t * const full_mid= full + SIZE*2;\ | |
2426 uint8_t halfH[SIZE*SIZE];\ | |
2427 uint8_t halfV[SIZE*SIZE];\ | |
2428 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2429 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2430 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2431 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2432 }\ | |
2433 \ | |
2434 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2435 uint8_t full[SIZE*(SIZE+5)];\ | |
2436 uint8_t * const full_mid= full + SIZE*2;\ | |
2437 uint8_t halfH[SIZE*SIZE];\ | |
2438 uint8_t halfV[SIZE*SIZE];\ | |
2439 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2440 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2441 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2442 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2443 }\ | |
2444 \ | |
2445 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2446 int16_t tmp[SIZE*(SIZE+5)];\ | |
2447 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\ | |
2448 }\ | |
2449 \ | |
2450 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2451 int16_t tmp[SIZE*(SIZE+5)];\ | |
2452 uint8_t halfH[SIZE*SIZE];\ | |
2453 uint8_t halfHV[SIZE*SIZE];\ | |
2454 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2455 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2456 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2457 }\ | |
2458 \ | |
2459 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2460 int16_t tmp[SIZE*(SIZE+5)];\ | |
2461 uint8_t halfH[SIZE*SIZE];\ | |
2462 uint8_t halfHV[SIZE*SIZE];\ | |
2463 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2464 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2465 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2466 }\ | |
2467 \ | |
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2469 uint8_t full[SIZE*(SIZE+5)];\ | |
2470 uint8_t * const full_mid= full + SIZE*2;\ | |
2471 int16_t tmp[SIZE*(SIZE+5)];\ | |
2472 uint8_t halfV[SIZE*SIZE];\ | |
2473 uint8_t halfHV[SIZE*SIZE];\ | |
2474 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2475 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2476 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2477 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2478 }\ | |
2479 \ | |
2480 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2481 uint8_t full[SIZE*(SIZE+5)];\ | |
2482 uint8_t * const full_mid= full + SIZE*2;\ | |
2483 int16_t tmp[SIZE*(SIZE+5)];\ | |
2484 uint8_t halfV[SIZE*SIZE];\ | |
2485 uint8_t halfHV[SIZE*SIZE];\ | |
2486 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2487 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2488 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2489 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2490 }\ | |
2491 | |
2492 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) | |
2493 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7) | |
2494 #define op_put(a, b) a = cm[((b) + 16)>>5] | |
2495 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1) | |
2496 #define op2_put(a, b) a = cm[((b) + 512)>>10] | |
2497 | |
2498 H264_LOWPASS(put_ , op_put, op2_put) | |
2499 H264_LOWPASS(avg_ , op_avg, op2_avg) | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2500 H264_MC(put_, 2) |
1168 | 2501 H264_MC(put_, 4) |
2502 H264_MC(put_, 8) | |
2503 H264_MC(put_, 16) | |
2504 H264_MC(avg_, 4) | |
2505 H264_MC(avg_, 8) | |
2506 H264_MC(avg_, 16) | |
2507 | |
2508 #undef op_avg | |
2509 #undef op_put | |
2510 #undef op2_avg | |
2511 #undef op2_put | |
2512 #endif | |
2513 | |
2448 | 2514 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom ) |
2515 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) | |
2415 | 2516 #define H264_WEIGHT(W,H) \ |
2517 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ | |
3029 | 2518 int y; \ |
2415 | 2519 offset <<= log2_denom; \ |
2520 if(log2_denom) offset += 1<<(log2_denom-1); \ | |
2521 for(y=0; y<H; y++, block += stride){ \ | |
2522 op_scale1(0); \ | |
2523 op_scale1(1); \ | |
2524 if(W==2) continue; \ | |
2525 op_scale1(2); \ | |
2526 op_scale1(3); \ | |
2527 if(W==4) continue; \ | |
2528 op_scale1(4); \ | |
2529 op_scale1(5); \ | |
2530 op_scale1(6); \ | |
2531 op_scale1(7); \ | |
2532 if(W==8) continue; \ | |
2533 op_scale1(8); \ | |
2534 op_scale1(9); \ | |
2535 op_scale1(10); \ | |
2536 op_scale1(11); \ | |
2537 op_scale1(12); \ | |
2538 op_scale1(13); \ | |
2539 op_scale1(14); \ | |
2540 op_scale1(15); \ | |
2541 } \ | |
2542 } \ | |
3029 | 2543 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ |
2544 int y; \ | |
2545 offset = ((offset + 1) | 1) << log2_denom; \ | |
2415 | 2546 for(y=0; y<H; y++, dst += stride, src += stride){ \ |
2547 op_scale2(0); \ | |
2548 op_scale2(1); \ | |
2549 if(W==2) continue; \ | |
2550 op_scale2(2); \ | |
2551 op_scale2(3); \ | |
2552 if(W==4) continue; \ | |
2553 op_scale2(4); \ | |
2554 op_scale2(5); \ | |
2555 op_scale2(6); \ | |
2556 op_scale2(7); \ | |
2557 if(W==8) continue; \ | |
2558 op_scale2(8); \ | |
2559 op_scale2(9); \ | |
2560 op_scale2(10); \ | |
2561 op_scale2(11); \ | |
2562 op_scale2(12); \ | |
2563 op_scale2(13); \ | |
2564 op_scale2(14); \ | |
2565 op_scale2(15); \ | |
2566 } \ | |
2567 } | |
2568 | |
2569 H264_WEIGHT(16,16) | |
2570 H264_WEIGHT(16,8) | |
2571 H264_WEIGHT(8,16) | |
2572 H264_WEIGHT(8,8) | |
2573 H264_WEIGHT(8,4) | |
2574 H264_WEIGHT(4,8) | |
2575 H264_WEIGHT(4,4) | |
2576 H264_WEIGHT(4,2) | |
2577 H264_WEIGHT(2,4) | |
2578 H264_WEIGHT(2,2) | |
2579 | |
2580 #undef op_scale1 | |
2581 #undef op_scale2 | |
2582 #undef H264_WEIGHT | |
2583 | |
936 | 2584 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ |
2585 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
2586 int i; | |
2587 | |
2588 for(i=0; i<h; i++){ | |
2589 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4]; | |
2590 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4]; | |
2591 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4]; | |
2592 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4]; | |
2593 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4]; | |
2594 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4]; | |
2595 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4]; | |
2596 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4]; | |
2597 dst+=dstStride; | |
2967 | 2598 src+=srcStride; |
936 | 2599 } |
2600 } | |
2601 | |
3432 | 2602 #ifdef CONFIG_CAVS_DECODER |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2603 /* AVS specific */ |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2604 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2605 |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2606 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2607 put_pixels8_c(dst, src, stride, 8); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2608 } |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2609 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2610 avg_pixels8_c(dst, src, stride, 8); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2611 } |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2612 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2613 put_pixels16_c(dst, src, stride, 16); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2614 } |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2615 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2616 avg_pixels16_c(dst, src, stride, 16); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2617 } |
3432 | 2618 #endif /* CONFIG_CAVS_DECODER */ |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2619 |
3526 | 2620 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER) |
2621 /* VC-1 specific */ | |
2622 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx); | |
2623 | |
2624 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) { | |
2625 put_pixels8_c(dst, src, stride, 8); | |
2626 } | |
2627 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */ | |
2628 | |
936 | 2629 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){ |
2630 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
2631 int i; | |
2632 | |
2633 for(i=0; i<w; i++){ | |
2634 const int src_1= src[ -srcStride]; | |
2635 const int src0 = src[0 ]; | |
2636 const int src1 = src[ srcStride]; | |
2637 const int src2 = src[2*srcStride]; | |
2638 const int src3 = src[3*srcStride]; | |
2639 const int src4 = src[4*srcStride]; | |
2640 const int src5 = src[5*srcStride]; | |
2641 const int src6 = src[6*srcStride]; | |
2642 const int src7 = src[7*srcStride]; | |
2643 const int src8 = src[8*srcStride]; | |
2644 const int src9 = src[9*srcStride]; | |
2645 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; | |
2646 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; | |
2647 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; | |
2648 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; | |
2649 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; | |
2650 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; | |
2651 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; | |
2652 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; | |
2653 src++; | |
2654 dst++; | |
2655 } | |
2656 } | |
2657 | |
2658 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){ | |
2659 put_pixels8_c(dst, src, stride, 8); | |
2660 } | |
2661 | |
2662 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){ | |
2663 uint8_t half[64]; | |
2664 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8); | |
2665 put_pixels8_l2(dst, src, half, stride, stride, 8, 8); | |
2666 } | |
2667 | |
2668 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){ | |
2669 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8); | |
2670 } | |
2671 | |
2672 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){ | |
2673 uint8_t half[64]; | |
2674 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8); | |
2675 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8); | |
2676 } | |
2677 | |
2678 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){ | |
2679 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8); | |
2680 } | |
2681 | |
2682 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){ | |
2683 uint8_t halfH[88]; | |
2684 uint8_t halfV[64]; | |
2685 uint8_t halfHV[64]; | |
2686 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11); | |
2687 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8); | |
2688 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8); | |
2689 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8); | |
2690 } | |
2691 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){ | |
2692 uint8_t halfH[88]; | |
2693 uint8_t halfV[64]; | |
2694 uint8_t halfHV[64]; | |
2695 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11); | |
2696 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8); | |
2697 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8); | |
2698 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8); | |
2699 } | |
2700 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){ | |
2701 uint8_t halfH[88]; | |
2702 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11); | |
2703 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8); | |
2704 } | |
2705 | |
1644 | 2706 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){ |
2707 int x; | |
2708 const int strength= ff_h263_loop_filter_strength[qscale]; | |
2967 | 2709 |
1644 | 2710 for(x=0; x<8; x++){ |
2711 int d1, d2, ad1; | |
2712 int p0= src[x-2*stride]; | |
2713 int p1= src[x-1*stride]; | |
2714 int p2= src[x+0*stride]; | |
2715 int p3= src[x+1*stride]; | |
2716 int d = (p0 - p3 + 4*(p2 - p1)) / 8; | |
2717 | |
2718 if (d<-2*strength) d1= 0; | |
2719 else if(d<- strength) d1=-2*strength - d; | |
2720 else if(d< strength) d1= d; | |
2721 else if(d< 2*strength) d1= 2*strength - d; | |
2722 else d1= 0; | |
2967 | 2723 |
1644 | 2724 p1 += d1; |
2725 p2 -= d1; | |
2726 if(p1&256) p1= ~(p1>>31); | |
2727 if(p2&256) p2= ~(p2>>31); | |
2967 | 2728 |
1644 | 2729 src[x-1*stride] = p1; |
2730 src[x+0*stride] = p2; | |
2731 | |
4001 | 2732 ad1= FFABS(d1)>>1; |
2967 | 2733 |
1644 | 2734 d2= clip((p0-p3)/4, -ad1, ad1); |
2967 | 2735 |
1644 | 2736 src[x-2*stride] = p0 - d2; |
2737 src[x+ stride] = p3 + d2; | |
2738 } | |
2739 } | |
2740 | |
2741 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){ | |
2742 int y; | |
2743 const int strength= ff_h263_loop_filter_strength[qscale]; | |
2967 | 2744 |
1644 | 2745 for(y=0; y<8; y++){ |
2746 int d1, d2, ad1; | |
2747 int p0= src[y*stride-2]; | |
2748 int p1= src[y*stride-1]; | |
2749 int p2= src[y*stride+0]; | |
2750 int p3= src[y*stride+1]; | |
2751 int d = (p0 - p3 + 4*(p2 - p1)) / 8; | |
2752 | |
2753 if (d<-2*strength) d1= 0; | |
2754 else if(d<- strength) d1=-2*strength - d; | |
2755 else if(d< strength) d1= d; | |
2756 else if(d< 2*strength) d1= 2*strength - d; | |
2757 else d1= 0; | |
2967 | 2758 |
1644 | 2759 p1 += d1; |
2760 p2 -= d1; | |
2761 if(p1&256) p1= ~(p1>>31); | |
2762 if(p2&256) p2= ~(p2>>31); | |
2967 | 2763 |
1644 | 2764 src[y*stride-1] = p1; |
2765 src[y*stride+0] = p2; | |
2766 | |
4001 | 2767 ad1= FFABS(d1)>>1; |
2967 | 2768 |
1644 | 2769 d2= clip((p0-p3)/4, -ad1, ad1); |
2967 | 2770 |
1644 | 2771 src[y*stride-2] = p0 - d2; |
2772 src[y*stride+1] = p3 + d2; | |
2773 } | |
2774 } | |
936 | 2775 |
2045 | 2776 static void h261_loop_filter_c(uint8_t *src, int stride){ |
2777 int x,y,xy,yz; | |
2778 int temp[64]; | |
2779 | |
2780 for(x=0; x<8; x++){ | |
2781 temp[x ] = 4*src[x ]; | |
2782 temp[x + 7*8] = 4*src[x + 7*stride]; | |
2783 } | |
2784 for(y=1; y<7; y++){ | |
2785 for(x=0; x<8; x++){ | |
2786 xy = y * stride + x; | |
2787 yz = y * 8 + x; | |
2788 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride]; | |
2044
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2789 } |
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2790 } |
2967 | 2791 |
2045 | 2792 for(y=0; y<8; y++){ |
2793 src[ y*stride] = (temp[ y*8] + 2)>>2; | |
2794 src[7+y*stride] = (temp[7+y*8] + 2)>>2; | |
2795 for(x=1; x<7; x++){ | |
2796 xy = y * stride + x; | |
2797 yz = y * 8 + x; | |
2798 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4; | |
2044
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2799 } |
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2800 } |
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2801 } |
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2802 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2803 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) |
2633 | 2804 { |
2805 int i, d; | |
2806 for( i = 0; i < 4; i++ ) { | |
2807 if( tc0[i] < 0 ) { | |
2808 pix += 4*ystride; | |
2809 continue; | |
2810 } | |
2811 for( d = 0; d < 4; d++ ) { | |
2812 const int p0 = pix[-1*xstride]; | |
2813 const int p1 = pix[-2*xstride]; | |
2814 const int p2 = pix[-3*xstride]; | |
2815 const int q0 = pix[0]; | |
2816 const int q1 = pix[1*xstride]; | |
2817 const int q2 = pix[2*xstride]; | |
2967 | 2818 |
4001 | 2819 if( FFABS( p0 - q0 ) < alpha && |
2820 FFABS( p1 - p0 ) < beta && | |
2821 FFABS( q1 - q0 ) < beta ) { | |
2967 | 2822 |
2633 | 2823 int tc = tc0[i]; |
2824 int i_delta; | |
2967 | 2825 |
4001 | 2826 if( FFABS( p2 - p0 ) < beta ) { |
2651 | 2827 pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); |
2633 | 2828 tc++; |
2829 } | |
4001 | 2830 if( FFABS( q2 - q0 ) < beta ) { |
2651 | 2831 pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); |
2633 | 2832 tc++; |
2833 } | |
2967 | 2834 |
2633 | 2835 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); |
2836 pix[-xstride] = clip_uint8( p0 + i_delta ); /* p0' */ | |
2837 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */ | |
2838 } | |
2839 pix += ystride; | |
2840 } | |
2841 } | |
2842 } | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2843 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
2633 | 2844 { |
2845 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); | |
2846 } | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2847 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
2633 | 2848 { |
2849 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); | |
2850 } | |
2851 | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2852 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) |
2633 | 2853 { |
2854 int i, d; | |
2855 for( i = 0; i < 4; i++ ) { | |
2856 const int tc = tc0[i]; | |
2857 if( tc <= 0 ) { | |
2858 pix += 2*ystride; | |
2859 continue; | |
2860 } | |
2861 for( d = 0; d < 2; d++ ) { | |
2862 const int p0 = pix[-1*xstride]; | |
2863 const int p1 = pix[-2*xstride]; | |
2864 const int q0 = pix[0]; | |
2865 const int q1 = pix[1*xstride]; | |
2866 | |
4001 | 2867 if( FFABS( p0 - q0 ) < alpha && |
2868 FFABS( p1 - p0 ) < beta && | |
2869 FFABS( q1 - q0 ) < beta ) { | |
2633 | 2870 |
2871 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); | |
2872 | |
2873 pix[-xstride] = clip_uint8( p0 + delta ); /* p0' */ | |
2874 pix[0] = clip_uint8( q0 - delta ); /* q0' */ | |
2875 } | |
2876 pix += ystride; | |
2877 } | |
2878 } | |
2879 } | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2880 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
2633 | 2881 { |
2882 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); | |
2883 } | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2884 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
2633 | 2885 { |
2886 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); | |
2887 } | |
2888 | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2889 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2890 { |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2891 int d; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2892 for( d = 0; d < 8; d++ ) { |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2893 const int p0 = pix[-1*xstride]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2894 const int p1 = pix[-2*xstride]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2895 const int q0 = pix[0]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2896 const int q1 = pix[1*xstride]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2897 |
4001 | 2898 if( FFABS( p0 - q0 ) < alpha && |
2899 FFABS( p1 - p0 ) < beta && | |
2900 FFABS( q1 - q0 ) < beta ) { | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2901 |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2902 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2903 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2904 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2905 pix += ystride; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2906 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2907 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2908 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2909 { |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2910 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2911 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2912 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2913 { |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2914 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2915 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
2916 |
1708 | 2917 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
0 | 2918 { |
2919 int s, i; | |
2920 | |
2921 s = 0; | |
1708 | 2922 for(i=0;i<h;i++) { |
0 | 2923 s += abs(pix1[0] - pix2[0]); |
2924 s += abs(pix1[1] - pix2[1]); | |
2925 s += abs(pix1[2] - pix2[2]); | |
2926 s += abs(pix1[3] - pix2[3]); | |
2927 s += abs(pix1[4] - pix2[4]); | |
2928 s += abs(pix1[5] - pix2[5]); | |
2929 s += abs(pix1[6] - pix2[6]); | |
2930 s += abs(pix1[7] - pix2[7]); | |
2931 s += abs(pix1[8] - pix2[8]); | |
2932 s += abs(pix1[9] - pix2[9]); | |
2933 s += abs(pix1[10] - pix2[10]); | |
2934 s += abs(pix1[11] - pix2[11]); | |
2935 s += abs(pix1[12] - pix2[12]); | |
2936 s += abs(pix1[13] - pix2[13]); | |
2937 s += abs(pix1[14] - pix2[14]); | |
2938 s += abs(pix1[15] - pix2[15]); | |
2939 pix1 += line_size; | |
2940 pix2 += line_size; | |
2941 } | |
2942 return s; | |
2943 } | |
2944 | |
1708 | 2945 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
0 | 2946 { |
2947 int s, i; | |
2948 | |
2949 s = 0; | |
1708 | 2950 for(i=0;i<h;i++) { |
0 | 2951 s += abs(pix1[0] - avg2(pix2[0], pix2[1])); |
2952 s += abs(pix1[1] - avg2(pix2[1], pix2[2])); | |
2953 s += abs(pix1[2] - avg2(pix2[2], pix2[3])); | |
2954 s += abs(pix1[3] - avg2(pix2[3], pix2[4])); | |
2955 s += abs(pix1[4] - avg2(pix2[4], pix2[5])); | |
2956 s += abs(pix1[5] - avg2(pix2[5], pix2[6])); | |
2957 s += abs(pix1[6] - avg2(pix2[6], pix2[7])); | |
2958 s += abs(pix1[7] - avg2(pix2[7], pix2[8])); | |
2959 s += abs(pix1[8] - avg2(pix2[8], pix2[9])); | |
2960 s += abs(pix1[9] - avg2(pix2[9], pix2[10])); | |
2961 s += abs(pix1[10] - avg2(pix2[10], pix2[11])); | |
2962 s += abs(pix1[11] - avg2(pix2[11], pix2[12])); | |
2963 s += abs(pix1[12] - avg2(pix2[12], pix2[13])); | |
2964 s += abs(pix1[13] - avg2(pix2[13], pix2[14])); | |
2965 s += abs(pix1[14] - avg2(pix2[14], pix2[15])); | |
2966 s += abs(pix1[15] - avg2(pix2[15], pix2[16])); | |
2967 pix1 += line_size; | |
2968 pix2 += line_size; | |
2969 } | |
2970 return s; | |
2971 } | |
2972 | |
/* SAD of a 16-wide block against the vertical half-pel interpolation of
 * the reference (average of each pixel and the pixel below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;   /* reference line underneath */

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3002 | |
/* SAD of a 16-wide block against the diagonal half-pel interpolation of
 * the reference (4-tap average of the 2x2 neighbourhood); reads one
 * pixel past column 15 on both reference lines. */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3032 | |
/* Plain SAD of an 8-wide block: sum of absolute differences between the
 * two pixel blocks over h lines. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3052 | |
/* SAD of an 8-wide block against the horizontal half-pel interpolation
 * of the reference; reads one pixel past column 7. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3072 | |
/* SAD of an 8-wide block against the vertical half-pel interpolation of
 * the reference. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3094 | |
/* SAD of an 8-wide block against the diagonal half-pel interpolation of
 * the reference; reads one pixel past column 7 on both reference lines. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3116 | |
2834 | 3117 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){ |
3118 MpegEncContext *c = v; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3119 int score1=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3120 int score2=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3121 int x,y; |
2066 | 3122 |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3123 for(y=0; y<h; y++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3124 for(x=0; x<16; x++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3125 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3126 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3127 if(y+1<h){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3128 for(x=0; x<15; x++){ |
4001 | 3129 score2+= FFABS( s1[x ] - s1[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3130 - s1[x+1] + s1[x+1+stride]) |
4001 | 3131 -FFABS( s2[x ] - s2[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3132 - s2[x+1] + s2[x+1+stride]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3133 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3134 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3135 s1+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3136 s2+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3137 } |
2066 | 3138 |
4001 | 3139 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
3140 else return score1 + FFABS(score2)*8; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3141 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3142 |
2834 | 3143 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){ |
3144 MpegEncContext *c = v; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3145 int score1=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3146 int score2=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3147 int x,y; |
2967 | 3148 |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3149 for(y=0; y<h; y++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3150 for(x=0; x<8; x++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3151 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3152 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3153 if(y+1<h){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3154 for(x=0; x<7; x++){ |
4001 | 3155 score2+= FFABS( s1[x ] - s1[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3156 - s1[x+1] + s1[x+1+stride]) |
4001 | 3157 -FFABS( s2[x ] - s2[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3158 - s2[x+1] + s2[x+1+stride]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3159 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3160 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3161 s1+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3162 s2+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3163 } |
2967 | 3164 |
4001 | 3165 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
3166 else return score1 + FFABS(score2)*8; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3167 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3168 |
1784 | 3169 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ |
3170 int i; | |
3171 unsigned int sum=0; | |
3172 | |
3173 for(i=0; i<8*8; i++){ | |
3174 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT)); | |
3175 int w= weight[i]; | |
3176 b>>= RECON_SHIFT; | |
3177 assert(-512<b && b<512); | |
3178 | |
3179 sum += (w*b)*(w*b)>>4; | |
3180 } | |
3181 return sum>>2; | |
3182 } | |
3183 | |
3184 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){ | |
3185 int i; | |
3186 | |
3187 for(i=0; i<8*8; i++){ | |
3188 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); | |
2967 | 3189 } |
1784 | 3190 } |
3191 | |
1100 | 3192 /** |
3193 * permutes an 8x8 block. | |
1101 | 3194 * @param block the block which will be permuted according to the given permutation vector |
1100 | 3195 * @param permutation the permutation vector |
3196 * @param last the last non zero coefficient in scantable order, used to speed the permutation up | |
2967 | 3197 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not |
1101 | 3198 * (inverse) permutated to scantable order! |
1100 | 3199 */ |
1064 | 3200 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last) |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
88
diff
changeset
|
3201 { |
764 | 3202 int i; |
945 | 3203 DCTELEM temp[64]; |
2967 | 3204 |
764 | 3205 if(last<=0) return; |
882 | 3206 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
88
diff
changeset
|
3207 |
764 | 3208 for(i=0; i<=last; i++){ |
3209 const int j= scantable[i]; | |
3210 temp[j]= block[j]; | |
3211 block[j]=0; | |
3212 } | |
2967 | 3213 |
764 | 3214 for(i=0; i<=last; i++){ |
3215 const int j= scantable[i]; | |
3216 const int perm_j= permutation[j]; | |
3217 block[perm_j]= temp[j]; | |
3218 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
88
diff
changeset
|
3219 } |
34 | 3220 |
/* Dummy compare function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3224 | |
3225 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){ | |
3226 int i; | |
2967 | 3227 |
1729 | 3228 memset(cmp, 0, sizeof(void*)*5); |
2967 | 3229 |
1729 | 3230 for(i=0; i<5; i++){ |
3231 switch(type&0xFF){ | |
3232 case FF_CMP_SAD: | |
3233 cmp[i]= c->sad[i]; | |
3234 break; | |
3235 case FF_CMP_SATD: | |
3236 cmp[i]= c->hadamard8_diff[i]; | |
3237 break; | |
3238 case FF_CMP_SSE: | |
3239 cmp[i]= c->sse[i]; | |
3240 break; | |
3241 case FF_CMP_DCT: | |
3242 cmp[i]= c->dct_sad[i]; | |
3243 break; | |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3244 case FF_CMP_DCT264: |
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3245 cmp[i]= c->dct264_sad[i]; |
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3246 break; |
2382 | 3247 case FF_CMP_DCTMAX: |
3248 cmp[i]= c->dct_max[i]; | |
3249 break; | |
1729 | 3250 case FF_CMP_PSNR: |
3251 cmp[i]= c->quant_psnr[i]; | |
3252 break; | |
3253 case FF_CMP_BIT: | |
3254 cmp[i]= c->bit[i]; | |
3255 break; | |
3256 case FF_CMP_RD: | |
3257 cmp[i]= c->rd[i]; | |
3258 break; | |
3259 case FF_CMP_VSAD: | |
3260 cmp[i]= c->vsad[i]; | |
3261 break; | |
3262 case FF_CMP_VSSE: | |
3263 cmp[i]= c->vsse[i]; | |
3264 break; | |
3265 case FF_CMP_ZERO: | |
3266 cmp[i]= zero_cmp; | |
3267 break; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3268 case FF_CMP_NSSE: |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3269 cmp[i]= c->nsse[i]; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3270 break; |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
3271 #ifdef CONFIG_SNOW_ENCODER |
2184 | 3272 case FF_CMP_W53: |
3273 cmp[i]= c->w53[i]; | |
3274 break; | |
3275 case FF_CMP_W97: | |
3276 cmp[i]= c->w97[i]; | |
3277 break; | |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
3278 #endif |
1729 | 3279 default: |
3280 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n"); | |
3281 } | |
3282 } | |
3283 } | |
3284 | |
1101 | 3285 /** |
3286 * memset(blocks, 0, sizeof(DCTELEM)*6*64) | |
3287 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3288 static void clear_blocks_c(DCTELEM *blocks) |
296 | 3289 { |
3290 memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
3291 } | |
3292 | |
/* Adds src to dst byte-wise (modulo 256): dst[i] += src[i] for i < w.
 * The main loop handles eight bytes per iteration; a scalar tail loop
 * covers the remainder. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    for (; i + 7 < w; i += 8) {
        dst[i + 0] += src[i + 0];
        dst[i + 1] += src[i + 1];
        dst[i + 2] += src[i + 2];
        dst[i + 3] += src[i + 3];
        dst[i + 4] += src[i + 4];
        dst[i + 5] += src[i + 5];
        dst[i + 6] += src[i + 6];
        dst[i + 7] += src[i + 7];
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
3308 | |
/* Byte-wise difference (modulo 256): dst[i] = src1[i] - src2[i] for
 * i < w. Eight bytes per main-loop iteration, scalar tail. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    for (; i + 7 < w; i += 8) {
        dst[i + 0] = src1[i + 0] - src2[i + 0];
        dst[i + 1] = src1[i + 1] - src2[i + 1];
        dst[i + 2] = src1[i + 2] - src2[i + 2];
        dst[i + 3] = src1[i + 3] - src2[i + 3];
        dst[i + 4] = src1[i + 4] - src2[i + 4];
        dst[i + 5] = src1[i + 5] - src2[i + 5];
        dst[i + 6] = src1[i + 6] - src2[i + 6];
        dst[i + 7] = src1[i + 7] - src2[i + 7];
    }
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3324 | |
/* HuffYUV median prediction: each src2 pixel is predicted as the median
 * of its left neighbour, the pixel above (src1[i]) and the gradient
 * left+above-topleft (clamped to a byte); dst receives the prediction
 * error. *left / *left_top carry the running state across calls. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val, topleft;

    left_val = *left;
    topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i],
                                  (left_val + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];
        left_val = src2[i];
        dst[i]   = left_val - pred;
    }

    *left     = left_val;
    *left_top = topleft;
}
3342 | |
/* radix-2 butterfly writing into two destinations */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place radix-2 butterfly */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: final butterfly folded into the absolute sum */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/* SATD: 8x8 Hadamard transform of the difference block (src - dst),
 * returning the sum of absolute transform coefficients. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int coef[64];
    int satd = 0;

    assert(h==8);

    /* horizontal pass: butterfly each row of the difference block */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(coef[8*i+0], coef[8*i+1], src[stride*i+0]-dst[stride*i+0], src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(coef[8*i+2], coef[8*i+3], src[stride*i+2]-dst[stride*i+2], src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(coef[8*i+4], coef[8*i+5], src[stride*i+4]-dst[stride*i+4], src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(coef[8*i+6], coef[8*i+7], src[stride*i+6]-dst[stride*i+6], src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(coef[8*i+0], coef[8*i+2]);
        BUTTERFLY1(coef[8*i+1], coef[8*i+3]);
        BUTTERFLY1(coef[8*i+4], coef[8*i+6]);
        BUTTERFLY1(coef[8*i+5], coef[8*i+7]);

        BUTTERFLY1(coef[8*i+0], coef[8*i+4]);
        BUTTERFLY1(coef[8*i+1], coef[8*i+5]);
        BUTTERFLY1(coef[8*i+2], coef[8*i+6]);
        BUTTERFLY1(coef[8*i+3], coef[8*i+7]);
    }

    /* vertical pass; the last butterfly stage is merged into the
     * absolute-value accumulation via BUTTERFLYA */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(coef[8*0+i], coef[8*1+i]);
        BUTTERFLY1(coef[8*2+i], coef[8*3+i]);
        BUTTERFLY1(coef[8*4+i], coef[8*5+i]);
        BUTTERFLY1(coef[8*6+i], coef[8*7+i]);

        BUTTERFLY1(coef[8*0+i], coef[8*2+i]);
        BUTTERFLY1(coef[8*1+i], coef[8*3+i]);
        BUTTERFLY1(coef[8*4+i], coef[8*6+i]);
        BUTTERFLY1(coef[8*5+i], coef[8*7+i]);

        satd += BUTTERFLYA(coef[8*0+i], coef[8*4+i])
               +BUTTERFLYA(coef[8*1+i], coef[8*5+i])
               +BUTTERFLYA(coef[8*2+i], coef[8*6+i])
               +BUTTERFLYA(coef[8*3+i], coef[8*7+i]);
    }
    return satd;
}
3409 | |
/* Intra SATD: 8x8 Hadamard transform of the source block itself, sum of
 * absolute coefficients minus the DC term (the mean does not help to
 * judge intra coding cost). */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int coef[64];
    int satd = 0;

    assert(h==8);

    /* horizontal pass over the raw source rows */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(coef[8*i+0], coef[8*i+1], src[stride*i+0], src[stride*i+1]);
        BUTTERFLY2(coef[8*i+2], coef[8*i+3], src[stride*i+2], src[stride*i+3]);
        BUTTERFLY2(coef[8*i+4], coef[8*i+5], src[stride*i+4], src[stride*i+5]);
        BUTTERFLY2(coef[8*i+6], coef[8*i+7], src[stride*i+6], src[stride*i+7]);

        BUTTERFLY1(coef[8*i+0], coef[8*i+2]);
        BUTTERFLY1(coef[8*i+1], coef[8*i+3]);
        BUTTERFLY1(coef[8*i+4], coef[8*i+6]);
        BUTTERFLY1(coef[8*i+5], coef[8*i+7]);

        BUTTERFLY1(coef[8*i+0], coef[8*i+4]);
        BUTTERFLY1(coef[8*i+1], coef[8*i+5]);
        BUTTERFLY1(coef[8*i+2], coef[8*i+6]);
        BUTTERFLY1(coef[8*i+3], coef[8*i+7]);
    }

    /* vertical pass; last stage merged into the absolute sum */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(coef[8*0+i], coef[8*1+i]);
        BUTTERFLY1(coef[8*2+i], coef[8*3+i]);
        BUTTERFLY1(coef[8*4+i], coef[8*5+i]);
        BUTTERFLY1(coef[8*6+i], coef[8*7+i]);

        BUTTERFLY1(coef[8*0+i], coef[8*2+i]);
        BUTTERFLY1(coef[8*1+i], coef[8*3+i]);
        BUTTERFLY1(coef[8*4+i], coef[8*6+i]);
        BUTTERFLY1(coef[8*5+i], coef[8*7+i]);

        satd += BUTTERFLYA(coef[8*0+i], coef[8*4+i])
               +BUTTERFLYA(coef[8*1+i], coef[8*5+i])
               +BUTTERFLYA(coef[8*2+i], coef[8*6+i])
               +BUTTERFLYA(coef[8*3+i], coef[8*7+i]);
    }

    satd -= FFABS(coef[8*0] + coef[8*4]); // -mean

    return satd;
}
3457 | |
1708 | 3458 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
936 | 3459 MpegEncContext * const s= (MpegEncContext *)c; |
3089 | 3460 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); |
1016 | 3461 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
936 | 3462 int sum=0, i; |
2967 | 3463 |
1708 | 3464 assert(h==8); |
936 | 3465 |
3466 s->dsp.diff_pixels(temp, src1, src2, stride); | |
1092 | 3467 s->dsp.fdct(temp); |
936 | 3468 |
3469 for(i=0; i<64; i++) | |
4001 | 3470 sum+= FFABS(temp[i]); |
2967 | 3471 |
936 | 3472 return sum; |
3473 } | |
3474 | |
#ifdef CONFIG_GPL
/* 1D 8-point integer DCT taken from x264 (H.264 high-profile 8x8
 * transform) — hence the GPL guard. SRC/DST are redefined around each
 * use to select row-wise or column-wise operation. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0, a0 + a1     ) ;\
    DST(1, a4 + (a7>>2)) ;\
    DST(2, a2 + (a3>>1)) ;\
    DST(3, a5 + (a6>>2)) ;\
    DST(4, a0 - a1     ) ;\
    DST(5, a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/* DCT-SAD using the H.264/x264 8x8 integer transform: rows first, then
 * columns, accumulating |coefficient| during the column pass. */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct, src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2382 | 3528 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
3529 MpegEncContext * const s= (MpegEncContext *)c; | |
3089 | 3530 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); |
2382 | 3531 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
3532 int sum=0, i; | |
2967 | 3533 |
2382 | 3534 assert(h==8); |
3535 | |
3536 s->dsp.diff_pixels(temp, src1, src2, stride); | |
3537 s->dsp.fdct(temp); | |
3538 | |
3539 for(i=0; i<64; i++) | |
4001 | 3540 sum= FFMAX(sum, FFABS(temp[i])); |
2967 | 3541 |
2382 | 3542 return sum; |
3543 } | |
3544 | |
1708 | 3545 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
936 | 3546 MpegEncContext * const s= (MpegEncContext *)c; |
3089 | 3547 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]); |
1016 | 3548 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
3549 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64; | |
936 | 3550 int sum=0, i; |
3551 | |
1708 | 3552 assert(h==8); |
936 | 3553 s->mb_intra=0; |
2967 | 3554 |
936 | 3555 s->dsp.diff_pixels(temp, src1, src2, stride); |
2967 | 3556 |
936 | 3557 memcpy(bak, temp, 64*sizeof(DCTELEM)); |
2967 | 3558 |
1013 | 3559 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
1689 | 3560 s->dct_unquantize_inter(s, temp, 0, s->qscale); |
2967 | 3561 simple_idct(temp); //FIXME |
3562 | |
936 | 3563 for(i=0; i<64; i++) |
3564 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]); | |
2967 | 3565 |
936 | 3566 return sum; |
3567 } | |
3568 | |
1708 | 3569 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
1007 | 3570 MpegEncContext * const s= (MpegEncContext *)c; |
1064 | 3571 const uint8_t *scantable= s->intra_scantable.permutated; |
3089 | 3572 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); |
3573 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]); | |
1016 | 3574 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
3575 uint8_t * const bak= (uint8_t*)aligned_bak; | |
1007 | 3576 int i, last, run, bits, level, distoration, start_i; |
3577 const int esc_length= s->ac_esc_length; | |
3578 uint8_t * length; | |
3579 uint8_t * last_length; | |
2967 | 3580 |
1708 | 3581 assert(h==8); |
3582 | |
1007 | 3583 for(i=0; i<8; i++){ |
3584 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0]; | |
3585 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1]; | |
3586 } | |
3587 | |
3588 s->dsp.diff_pixels(temp, src1, src2, stride); | |
3589 | |
1013 | 3590 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
3591 | |
3592 bits=0; | |
2967 | 3593 |
1013 | 3594 if (s->mb_intra) { |
2967 | 3595 start_i = 1; |
1013 | 3596 length = s->intra_ac_vlc_length; |
3597 last_length= s->intra_ac_vlc_last_length; | |
3598 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma | |
3599 } else { | |
3600 start_i = 0; | |
3601 length = s->inter_ac_vlc_length; | |
3602 last_length= s->inter_ac_vlc_last_length; | |
3603 } | |
2967 | 3604 |
1013 | 3605 if(last>=start_i){ |
1007 | 3606 run=0; |
3607 for(i=start_i; i<last; i++){ | |
3608 int j= scantable[i]; | |
3609 level= temp[j]; | |
2967 | 3610 |
1007 | 3611 if(level){ |
3612 level+=64; | |
3613 if((level&(~127)) == 0){ | |
3614 bits+= length[UNI_AC_ENC_INDEX(run, level)]; | |
3615 }else | |
3616 bits+= esc_length; | |
3617 run=0; | |
3618 }else | |
3619 run++; | |
3620 } | |
3621 i= scantable[last]; | |
2967 | 3622 |
1011 | 3623 level= temp[i] + 64; |
3624 | |
3625 assert(level - 64); | |
2967 | 3626 |
1007 | 3627 if((level&(~127)) == 0){ |
3628 bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; | |
3629 }else | |
3630 bits+= esc_length; | |
2967 | 3631 |
1013 | 3632 } |
3633 | |
3634 if(last>=0){ | |
1689 | 3635 if(s->mb_intra) |
3636 s->dct_unquantize_intra(s, temp, 0, s->qscale); | |
3637 else | |
3638 s->dct_unquantize_inter(s, temp, 0, s->qscale); | |
1007 | 3639 } |
2967 | 3640 |
1092 | 3641 s->dsp.idct_add(bak, stride, temp); |
2967 | 3642 |
1708 | 3643 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8); |
1007 | 3644 |
1013 | 3645 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7); |
1007 | 3646 } |
3647 | |
1708 | 3648 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
1007 | 3649 MpegEncContext * const s= (MpegEncContext *)c; |
1064 | 3650 const uint8_t *scantable= s->intra_scantable.permutated; |
3089 | 3651 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); |
1016 | 3652 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
1007 | 3653 int i, last, run, bits, level, start_i; |
3654 const int esc_length= s->ac_esc_length; | |
3655 uint8_t * length; | |
3656 uint8_t * last_length; | |
1708 | 3657 |
3658 assert(h==8); | |
2967 | 3659 |
1013 | 3660 s->dsp.diff_pixels(temp, src1, src2, stride); |
1007 | 3661 |
1013 | 3662 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
3663 | |
3664 bits=0; | |
2967 | 3665 |
1007 | 3666 if (s->mb_intra) { |
2967 | 3667 start_i = 1; |
1007 | 3668 length = s->intra_ac_vlc_length; |
3669 last_length= s->intra_ac_vlc_last_length; | |
1013 | 3670 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma |
1007 | 3671 } else { |
3672 start_i = 0; | |
3673 length = s->inter_ac_vlc_length; | |
3674 last_length= s->inter_ac_vlc_last_length; | |
3675 } | |
2967 | 3676 |
1013 | 3677 if(last>=start_i){ |
1007 | 3678 run=0; |
3679 for(i=start_i; i<last; i++){ | |
3680 int j= scantable[i]; | |
3681 level= temp[j]; | |
2967 | 3682 |
1007 | 3683 if(level){ |
3684 level+=64; | |
3685 if((level&(~127)) == 0){ | |
3686 bits+= length[UNI_AC_ENC_INDEX(run, level)]; | |
3687 }else | |
3688 bits+= esc_length; | |
3689 run=0; | |
3690 }else | |
3691 run++; | |
3692 } | |
3693 i= scantable[last]; | |
2967 | 3694 |
1013 | 3695 level= temp[i] + 64; |
2967 | 3696 |
1013 | 3697 assert(level - 64); |
2967 | 3698 |
1007 | 3699 if((level&(~127)) == 0){ |
3700 bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; | |
3701 }else | |
3702 bits+= esc_length; | |
3703 } | |
3704 | |
3705 return bits; | |
3706 } | |
3707 | |
/* Vertical SAD of a single 16-pixel-wide block: sum of absolute
 * differences between vertically adjacent pixels over h-1 row pairs.
 * The second source pointer is unused (intra variant). */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d = s[x] - s[x+stride];
            score += d >= 0 ? d : -d;
        }
        s += stride;
    }

    return score;
}
3722 | |
/* Vertical SAD of the difference signal s1-s2: sums |(s1-s2)[x,y] -
 * (s1-s2)[x,y+1]| over a 16-pixel-wide block, h-1 row pairs. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d = s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            score += d >= 0 ? d : -d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3737 | |
#define SQ(a) ((a)*(a))

/* Vertical SSE of a single 16-pixel-wide block: sum of squared
 * differences between vertically adjacent pixels over h-1 row pairs.
 * The second source pointer is unused (intra variant). */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++)
            score += SQ(s[x] - s[x+stride]);
        s += stride;
    }

    return score;
}
3753 | |
/* Vertical SSE of the difference signal s1-s2: sums the squared vertical
 * gradient of (s1-s2) over a 16-pixel-wide block, h-1 row pairs. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d = s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            score += d*d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3768 | |
1708 | 3769 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) |
1729 | 3770 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) |
1708 | 3771 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) |
3013 | 3772 #ifdef CONFIG_GPL |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3773 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) |
3013 | 3774 #endif |
2382 | 3775 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c) |
1708 | 3776 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) |
3777 WARPER8_16_SQ(rd8x8_c, rd16_c) | |
3778 WARPER8_16_SQ(bit8x8_c, bit16_c) | |
936 | 3779 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/* Element-wise in-place multiply: dst[i] *= src[i] for i in [0, len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    const float *end = src + len;

    while (src < end)
        *dst++ *= *src++;
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
3785 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;

    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[len-1-i];
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
3792 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/* Multiply-add with scalar offset and strided output:
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3 for i in [0, len).
 * src3 is an integer offset added after conversion to float. */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    float *out = dst;
    int i;

    for(i=0; i<len; i++, out += step)
        *out = src0[i] * src1[i] + src2[i] + src3;
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
3798 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/* Convert "magic biased" floats to signed 16-bit integers.
 *
 * The conversion works on the raw IEEE-754 bit pattern of each float:
 * in-range inputs have their high bits equal to the 0x43c0xxxx bias, so
 * the low 16 bits minus 0x8000 are the sample value. Inputs whose bits
 * 16..19 are non-zero are out of range: tmp is forced to 0 or -1
 * (sign of 0x43c0ffff - tmp), which after the -0x8000 offset and int16
 * truncation yields INT16_MIN or INT16_MAX respectively.
 * NOTE(review): the exact biasing contract comes from the callers
 * (mdct windowing, per the surrounding history) — confirm there.
 *
 * Fix: the original read the float bits via ((int32_t*)src)[i], which
 * violates C strict-aliasing rules (undefined behavior). The bits are
 * now extracted with memcpy, which compilers lower to the same load.
 * The arithmetic is otherwise unchanged (including the arithmetic
 * right shift of a negative value, as the original relied on).
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t bits;
        int_fast32_t tmp;

        memcpy(&bits, &src[i], sizeof(bits)); /* well-defined type pun */
        tmp = bits;
        if(tmp & 0xf0000){
            /* out of range: 0 if above, -1 if below the valid window */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
            // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
            // else tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
3812 |
1092 | 3813 /* XXX: those functions should be suppressed ASAP when all IDCTs are |
3814 converted */ | |
3815 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block) | |
3816 { | |
3817 j_rev_dct (block); | |
3818 put_pixels_clamped_c(block, dest, line_size); | |
3819 } | |
3820 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
3821 { | |
3822 j_rev_dct (block); | |
3823 add_pixels_clamped_c(block, dest, line_size); | |
3824 } | |
3825 | |
2256 | 3826 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block) |
3827 { | |
3828 j_rev_dct4 (block); | |
3829 put_pixels_clamped4_c(block, dest, line_size); | |
3830 } | |
3831 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block) | |
3832 { | |
3833 j_rev_dct4 (block); | |
3834 add_pixels_clamped4_c(block, dest, line_size); | |
3835 } | |
3836 | |
2257 | 3837 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block) |
3838 { | |
3839 j_rev_dct2 (block); | |
3840 put_pixels_clamped2_c(block, dest, line_size); | |
3841 } | |
3842 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
3843 { | |
3844 j_rev_dct2 (block); | |
3845 add_pixels_clamped2_c(block, dest, line_size); | |
3846 } | |
3847 | |
2259 | 3848 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block) |
3849 { | |
3850 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
3851 | |
3852 dest[0] = cm[(block[0] + 4)>>3]; | |
3853 } | |
3854 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block) | |
3855 { | |
3856 uint8_t *cm = cropTbl + MAX_NEG_CROP; | |
3857 | |
3858 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)]; | |
3859 } | |
3860 | |
/* Do-nothing function used where a non-NULL callback is required.
   NOTE(review): presumably installed as the default prefetch hook in
   dsputil_init() (per the surrounding history) — confirm there. */
static void just_return() { }
1201 | 3863 /* init static data */ |
3864 void dsputil_static_init(void) | |
0 | 3865 { |
751 | 3866 int i; |
0 | 3867 |
1201 | 3868 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; |
3869 for(i=0;i<MAX_NEG_CROP;i++) { | |
3870 cropTbl[i] = 0; | |
3871 cropTbl[i + MAX_NEG_CROP + 256] = 255; | |
3872 } | |
2967 | 3873 |
1201 | 3874 for(i=0;i<512;i++) { |
3875 squareTbl[i] = (i - 256) * (i - 256); | |
3876 } | |
2967 | 3877 |
1201 | 3878 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1; |
3879 } | |
0 | 3880 |
861 | 3881 |
1201 | 3882 void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
3883 { | |
3884 int i; | |
0 | 3885 |
1092 | 3886 #ifdef CONFIG_ENCODERS |
1567 | 3887 if(avctx->dct_algo==FF_DCT_FASTINT) { |
1092 | 3888 c->fdct = fdct_ifast; |
2979 | 3889 c->fdct248 = fdct_ifast248; |
2967 | 3890 } |
1567 | 3891 else if(avctx->dct_algo==FF_DCT_FAAN) { |
1557 | 3892 c->fdct = ff_faandct; |
2979 | 3893 c->fdct248 = ff_faandct248; |
2967 | 3894 } |
1567 | 3895 else { |
1092 | 3896 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default |
2979 | 3897 c->fdct248 = ff_fdct248_islow; |
1567 | 3898 } |
1092 | 3899 #endif //CONFIG_ENCODERS |
3900 | |
2256 | 3901 if(avctx->lowres==1){ |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3902 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){ |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3903 c->idct_put= ff_jref_idct4_put; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3904 c->idct_add= ff_jref_idct4_add; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3905 }else{ |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3906 c->idct_put= ff_h264_lowres_idct_put_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3907 c->idct_add= ff_h264_lowres_idct_add_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3908 } |
2256 | 3909 c->idct = j_rev_dct4; |
1092 | 3910 c->idct_permutation_type= FF_NO_IDCT_PERM; |
2257 | 3911 }else if(avctx->lowres==2){ |
3912 c->idct_put= ff_jref_idct2_put; | |
3913 c->idct_add= ff_jref_idct2_add; | |
3914 c->idct = j_rev_dct2; | |
3915 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2259 | 3916 }else if(avctx->lowres==3){ |
3917 c->idct_put= ff_jref_idct1_put; | |
3918 c->idct_add= ff_jref_idct1_add; | |
3919 c->idct = j_rev_dct1; | |
3920 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 3921 }else{ |
3922 if(avctx->idct_algo==FF_IDCT_INT){ | |
3923 c->idct_put= ff_jref_idct_put; | |
3924 c->idct_add= ff_jref_idct_add; | |
3925 c->idct = j_rev_dct; | |
3926 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
2693 | 3927 }else if(avctx->idct_algo==FF_IDCT_VP3){ |
3928 c->idct_put= ff_vp3_idct_put_c; | |
3929 c->idct_add= ff_vp3_idct_add_c; | |
3930 c->idct = ff_vp3_idct_c; | |
3931 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 3932 }else{ //accurate/default |
3933 c->idct_put= simple_idct_put; | |
3934 c->idct_add= simple_idct_add; | |
3935 c->idct = simple_idct; | |
3936 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
3937 } | |
1092 | 3938 } |
3939 | |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3940 c->h264_idct_add= ff_h264_idct_add_c; |
2755 | 3941 c->h264_idct8_add= ff_h264_idct8_add_c; |
3105
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
3942 c->h264_idct_dc_add= ff_h264_idct_dc_add_c; |
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
3943 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
3944 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3945 c->get_pixels = get_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3946 c->diff_pixels = diff_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3947 c->put_pixels_clamped = put_pixels_clamped_c; |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
3948 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3949 c->add_pixels_clamped = add_pixels_clamped_c; |
2763 | 3950 c->add_pixels8 = add_pixels8_c; |
3951 c->add_pixels4 = add_pixels4_c; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3952 c->gmc1 = gmc1_c; |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
3953 c->gmc = ff_gmc_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3954 c->clear_blocks = clear_blocks_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3955 c->pix_sum = pix_sum_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3956 c->pix_norm1 = pix_norm1_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3957 |
859 | 3958 /* TODO [0] 16 [1] 8 */ |
1708 | 3959 c->pix_abs[0][0] = pix_abs16_c; |
3960 c->pix_abs[0][1] = pix_abs16_x2_c; | |
3961 c->pix_abs[0][2] = pix_abs16_y2_c; | |
3962 c->pix_abs[0][3] = pix_abs16_xy2_c; | |
3963 c->pix_abs[1][0] = pix_abs8_c; | |
3964 c->pix_abs[1][1] = pix_abs8_x2_c; | |
3965 c->pix_abs[1][2] = pix_abs8_y2_c; | |
3966 c->pix_abs[1][3] = pix_abs8_xy2_c; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3967 |
859 | 3968 #define dspfunc(PFX, IDX, NUM) \ |
3969 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \ | |
3970 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \ | |
3971 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \ | |
3972 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3973 |
859 | 3974 dspfunc(put, 0, 16); |
3975 dspfunc(put_no_rnd, 0, 16); | |
3976 dspfunc(put, 1, 8); | |
3977 dspfunc(put_no_rnd, 1, 8); | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3978 dspfunc(put, 2, 4); |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3979 dspfunc(put, 3, 2); |
0 | 3980 |
859 | 3981 dspfunc(avg, 0, 16); |
3982 dspfunc(avg_no_rnd, 0, 16); | |
3983 dspfunc(avg, 1, 8); | |
3984 dspfunc(avg_no_rnd, 1, 8); | |
1319 | 3985 dspfunc(avg, 2, 4); |
3986 dspfunc(avg, 3, 2); | |
859 | 3987 #undef dspfunc |
857 | 3988 |
1864 | 3989 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c; |
3990 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c; | |
3991 | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3992 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3993 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3994 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3995 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3996 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3997 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3998 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
3999 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4000 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4001 |
1319 | 4002 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c; |
4003 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c; | |
4004 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c; | |
4005 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c; | |
4006 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c; | |
4007 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c; | |
4008 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c; | |
4009 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c; | |
4010 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c; | |
4011 | |
859 | 4012 #define dspfunc(PFX, IDX, NUM) \ |
4013 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ | |
4014 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ | |
4015 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ | |
4016 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ | |
4017 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ | |
4018 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ | |
4019 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ | |
4020 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ | |
4021 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ | |
4022 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ | |
4023 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ | |
4024 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ | |
4025 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ | |
4026 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ | |
4027 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ | |
4028 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c | |
857 | 4029 |
859 | 4030 dspfunc(put_qpel, 0, 16); |
4031 dspfunc(put_no_rnd_qpel, 0, 16); | |
4032 | |
4033 dspfunc(avg_qpel, 0, 16); | |
4034 /* dspfunc(avg_no_rnd_qpel, 0, 16); */ | |
857 | 4035 |
859 | 4036 dspfunc(put_qpel, 1, 8); |
4037 dspfunc(put_no_rnd_qpel, 1, 8); | |
4038 | |
4039 dspfunc(avg_qpel, 1, 8); | |
4040 /* dspfunc(avg_no_rnd_qpel, 1, 8); */ | |
1168 | 4041 |
4042 dspfunc(put_h264_qpel, 0, 16); | |
4043 dspfunc(put_h264_qpel, 1, 8); | |
4044 dspfunc(put_h264_qpel, 2, 4); | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
4045 dspfunc(put_h264_qpel, 3, 2); |
1168 | 4046 dspfunc(avg_h264_qpel, 0, 16); |
4047 dspfunc(avg_h264_qpel, 1, 8); | |
4048 dspfunc(avg_h264_qpel, 2, 4); | |
4049 | |
859 | 4050 #undef dspfunc |
1168 | 4051 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; |
4052 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; | |
4053 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; | |
4054 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; | |
4055 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; | |
4056 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; | |
3663 | 4057 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c; |
857 | 4058 |
2415 | 4059 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; |
4060 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |
4061 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; | |
4062 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; | |
4063 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; | |
4064 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; | |
4065 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; | |
4066 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; | |
4067 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; | |
4068 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; | |
4069 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; | |
4070 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; | |
4071 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; | |
4072 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; | |
4073 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; | |
4074 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; | |
4075 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; | |
4076 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; | |
4077 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; | |
4078 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; | |
4079 | |
3432 | 4080 #ifdef CONFIG_CAVS_DECODER |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4081 ff_cavsdsp_init(c,avctx); |
3432 | 4082 #endif |
3526 | 4083 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER) |
4084 ff_vc1dsp_init(c,avctx); | |
4085 #endif | |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4086 |
936 | 4087 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c; |
4088 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c; | |
4089 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c; | |
4090 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c; | |
4091 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c; | |
4092 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c; | |
4093 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c; | |
4094 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c; | |
2967 | 4095 |
1708 | 4096 #define SET_CMP_FUNC(name) \ |
4097 c->name[0]= name ## 16_c;\ | |
4098 c->name[1]= name ## 8x8_c; | |
2967 | 4099 |
1708 | 4100 SET_CMP_FUNC(hadamard8_diff) |
1729 | 4101 c->hadamard8_diff[4]= hadamard8_intra16_c; |
1708 | 4102 SET_CMP_FUNC(dct_sad) |
2382 | 4103 SET_CMP_FUNC(dct_max) |
3013 | 4104 #ifdef CONFIG_GPL |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
4105 SET_CMP_FUNC(dct264_sad) |
3013 | 4106 #endif |
1708 | 4107 c->sad[0]= pix_abs16_c; |
4108 c->sad[1]= pix_abs8_c; | |
4109 c->sse[0]= sse16_c; | |
4110 c->sse[1]= sse8_c; | |
2184 | 4111 c->sse[2]= sse4_c; |
1708 | 4112 SET_CMP_FUNC(quant_psnr) |
4113 SET_CMP_FUNC(rd) | |
4114 SET_CMP_FUNC(bit) | |
1729 | 4115 c->vsad[0]= vsad16_c; |
4116 c->vsad[4]= vsad_intra16_c; | |
4117 c->vsse[0]= vsse16_c; | |
4118 c->vsse[4]= vsse_intra16_c; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4119 c->nsse[0]= nsse16_c; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4120 c->nsse[1]= nsse8_c; |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
4121 #ifdef CONFIG_SNOW_ENCODER |
2184 | 4122 c->w53[0]= w53_16_c; |
4123 c->w53[1]= w53_8_c; | |
4124 c->w97[0]= w97_16_c; | |
4125 c->w97[1]= w97_8_c; | |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
4126 #endif |
2184 | 4127 |
866 | 4128 c->add_bytes= add_bytes_c; |
4129 c->diff_bytes= diff_bytes_c; | |
1527 | 4130 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c; |
1273 | 4131 c->bswap_buf= bswap_buf; |
2633 | 4132 |
4133 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; | |
4134 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; | |
4135 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; | |
4136 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4137 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4138 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3568
diff
changeset
|
4139 c->h264_loop_filter_strength= NULL; |
2967 | 4140 |
1644 | 4141 c->h263_h_loop_filter= h263_h_loop_filter_c; |
4142 c->h263_v_loop_filter= h263_v_loop_filter_c; | |
2967 | 4143 |
2045 | 4144 c->h261_loop_filter= h261_loop_filter_c; |
2967 | 4145 |
1784 | 4146 c->try_8x8basis= try_8x8basis_c; |
4147 c->add_8x8basis= add_8x8basis_c; | |
866 | 4148 |
3199
1651e69b9f7a
10l: Only set *compose97i *add_yblock to dsputils context if we are building with Snow enabled
gpoirier
parents:
3198
diff
changeset
|
4149 #ifdef CONFIG_SNOW_ENCODER |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4150 c->vertical_compose97i = ff_snow_vertical_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4151 c->horizontal_compose97i = ff_snow_horizontal_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4152 c->inner_add_yblock = ff_snow_inner_add_yblock; |
3199
1651e69b9f7a
10l: Only set *compose97i *add_yblock to dsputils context if we are building with Snow enabled
gpoirier
parents:
3198
diff
changeset
|
4153 #endif |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4154 |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4155 #ifdef CONFIG_VORBIS_DECODER |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4156 c->vorbis_inverse_coupling = vorbis_inverse_coupling; |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4157 #endif |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4158 c->vector_fmul = vector_fmul_c; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4159 c->vector_fmul_reverse = vector_fmul_reverse_c; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4160 c->vector_fmul_add_add = ff_vector_fmul_add_add_c; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4161 c->float_to_int16 = ff_float_to_int16_c; |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4162 |
3245 | 4163 c->shrink[0]= ff_img_copy_plane; |
4164 c->shrink[1]= ff_shrink22; | |
4165 c->shrink[2]= ff_shrink44; | |
4166 c->shrink[3]= ff_shrink88; | |
4167 | |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4168 c->prefetch= just_return; |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4169 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4170 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4171 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4172 |
2 | 4173 #ifdef HAVE_MMX |
1092 | 4174 dsputil_init_mmx(c, avctx); |
0 | 4175 #endif |
62 | 4176 #ifdef ARCH_ARMV4L |
1092 | 4177 dsputil_init_armv4l(c, avctx); |
62 | 4178 #endif |
88 | 4179 #ifdef HAVE_MLIB |
1092 | 4180 dsputil_init_mlib(c, avctx); |
88 | 4181 #endif |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1866
diff
changeset
|
4182 #ifdef ARCH_SPARC |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1866
diff
changeset
|
4183 dsputil_init_vis(c,avctx); |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1866
diff
changeset
|
4184 #endif |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
209
diff
changeset
|
4185 #ifdef ARCH_ALPHA |
1092 | 4186 dsputil_init_alpha(c, avctx); |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
209
diff
changeset
|
4187 #endif |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
4188 #ifdef ARCH_POWERPC |
1092 | 4189 dsputil_init_ppc(c, avctx); |
626
23a093d6e450
patch by Heliodoro Tammaro <helio at interactives dot org>
michaelni
parents:
625
diff
changeset
|
4190 #endif |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
676
diff
changeset
|
4191 #ifdef HAVE_MMI |
1092 | 4192 dsputil_init_mmi(c, avctx); |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
676
diff
changeset
|
4193 #endif |
1259
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1201
diff
changeset
|
4194 #ifdef ARCH_SH4 |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1201
diff
changeset
|
4195 dsputil_init_sh4(c,avctx); |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1201
diff
changeset
|
4196 #endif |
3728 | 4197 #ifdef ARCH_BFIN |
4198 dsputil_init_bfin(c,avctx); | |
4199 #endif | |
1092 | 4200 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4201 for(i=0; i<64; i++){ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4202 if(!c->put_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4203 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4204 if(!c->avg_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4205 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4206 } |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4207 |
1092 | 4208 switch(c->idct_permutation_type){ |
4209 case FF_NO_IDCT_PERM: | |
4210 for(i=0; i<64; i++) | |
4211 c->idct_permutation[i]= i; | |
4212 break; | |
4213 case FF_LIBMPEG2_IDCT_PERM: | |
4214 for(i=0; i<64; i++) | |
4215 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); | |
4216 break; | |
4217 case FF_SIMPLE_IDCT_PERM: | |
4218 for(i=0; i<64; i++) | |
4219 c->idct_permutation[i]= simple_mmx_permutation[i]; | |
4220 break; | |
4221 case FF_TRANSPOSE_IDCT_PERM: | |
4222 for(i=0; i<64; i++) | |
4223 c->idct_permutation[i]= ((i&7)<<3) | (i>>3); | |
4224 break; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4225 case FF_PARTTRANS_IDCT_PERM: |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4226 for(i=0; i<64; i++) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4227 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4228 break; |
1092 | 4229 default: |
1598
932d306bf1dc
av_log() patch by (Michel Bardiaux <mbardiaux at peaktime dot be>)
michael
parents:
1571
diff
changeset
|
4230 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n"); |
1092 | 4231 } |
0 | 4232 } |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
220
diff
changeset
|
4233 |