Mercurial > libavcodec.hg
annotate dsputil.c @ 10320:44f31c1c9acc libavcodec
Make sure that dv encoder initializes all encoded packet data.
The specification does not say which value to use for unused
parts, so fill all unused bytes with 0xff, which is consistent
with what DV usually uses for reserved or unused parts.
author | reimar |
---|---|
date | Tue, 29 Sep 2009 19:17:18 +0000 |
parents | 4d1b9ca628fc |
children | 6009feb54020 |
rev | line source |
---|---|
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
2967 | 24 |
1106 | 25 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8629
diff
changeset
|
26 * @file libavcodec/dsputil.c |
1106 | 27 * DSP utils |
28 */ | |
2967 | 29 |
0 | 30 #include "avcodec.h" |
31 #include "dsputil.h" | |
1092 | 32 #include "simple_idct.h" |
1557 | 33 #include "faandct.h" |
6407 | 34 #include "faanidct.h" |
8627
d6bab465b82c
moves mid_pred() into mathops.h (with arch specific code split by directory)
aurel
parents:
8596
diff
changeset
|
35 #include "mathops.h" |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5256
diff
changeset
|
36 #include "h263.h" |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
37 #include "snow.h" |
676 | 38 |
2522
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
39 /* snow.c */ |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count); |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
41 |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
42 /* vorbis.c */ |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize); |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
44 |
7563 | 45 /* ac3dec.c */ |
46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len); | |
47 | |
5737 | 48 /* flacenc.c */ |
49 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc); | |
50 | |
6384 | 51 /* pngdec.c */ |
52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); | |
53 | |
8120 | 54 /* eaidct.c */ |
55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); | |
56 | |
/* Clipping and squaring lookup tables. Zero-initialized here; presumably
 * filled in at runtime by the dsputil init code — not visible in this chunk. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Default JPEG/MPEG zigzag scan order for an 8x8 block (no IDCT
 * permutation applied). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
74 | |
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
87 | |
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
 * Must be 16-byte aligned for dct_quantize_SSE2 (see revision note). */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);
220 | 90 |
/* Alternate horizontal scan order (MPEG-2 style) for an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
101 | |
/* Alternate vertical scan order for an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
112 | |
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
150 | |
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
162 | |
/* Row permutation used by the SSE2 IDCT (xvid_sse2 and similar IDCTs). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
6438 | 165 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){ |
166 int i; | |
167 int end; | |
168 | |
169 st->scantable= src_scantable; | |
170 | |
171 for(i=0; i<64; i++){ | |
172 int j; | |
173 j = src_scantable[i]; | |
174 st->permutated[i] = permutation[j]; | |
8590 | 175 #if ARCH_PPC |
6438 | 176 st->inverse[j] = i; |
177 #endif | |
178 } | |
179 | |
180 end=-1; | |
181 for(i=0; i<64; i++){ | |
182 int j; | |
183 j = st->permutated[i]; | |
184 if(j>end) end=j; | |
185 st->raster_end[i]= end; | |
186 } | |
187 } | |
188 | |
/* Sum of all 256 samples of a 16x16 block starting at pix. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x];
        pix += line_size;   /* next row */
    }
    return sum;
}
210 | |
1064 | 211 static int pix_norm1_c(uint8_t * pix, int line_size) |
612 | 212 { |
213 int s, i, j; | |
4179 | 214 uint32_t *sq = ff_squareTbl + 256; |
612 | 215 |
216 s = 0; | |
217 for (i = 0; i < 16; i++) { | |
2979 | 218 for (j = 0; j < 16; j += 8) { |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
219 #if 0 |
2979 | 220 s += sq[pix[0]]; |
221 s += sq[pix[1]]; | |
222 s += sq[pix[2]]; | |
223 s += sq[pix[3]]; | |
224 s += sq[pix[4]]; | |
225 s += sq[pix[5]]; | |
226 s += sq[pix[6]]; | |
227 s += sq[pix[7]]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
228 #else |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
229 #if LONG_MAX > 2147483647 |
2979 | 230 register uint64_t x=*(uint64_t*)pix; |
231 s += sq[x&0xff]; | |
232 s += sq[(x>>8)&0xff]; | |
233 s += sq[(x>>16)&0xff]; | |
234 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
235 s += sq[(x>>32)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
236 s += sq[(x>>40)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
237 s += sq[(x>>48)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
238 s += sq[(x>>56)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
239 #else |
2979 | 240 register uint32_t x=*(uint32_t*)pix; |
241 s += sq[x&0xff]; | |
242 s += sq[(x>>8)&0xff]; | |
243 s += sq[(x>>16)&0xff]; | |
244 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
245 x=*(uint32_t*)(pix+4); |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
246 s += sq[x&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
247 s += sq[(x>>8)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
248 s += sq[(x>>16)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
249 s += sq[(x>>24)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
250 #endif |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
251 #endif |
2979 | 252 pix += 8; |
253 } | |
254 pix += line_size - 16; | |
612 | 255 } |
256 return s; | |
257 } | |
258 | |
/* Byte-swap w 32-bit words from src into dst (in-place use is fine).
 * Main loop is unrolled by 8; the tail handles the remainder. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        dst[i]     = bswap_32(src[i]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
    }
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
612 | 276 |
2184 | 277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
278 { | |
279 int s, i; | |
4179 | 280 uint32_t *sq = ff_squareTbl + 256; |
2184 | 281 |
282 s = 0; | |
283 for (i = 0; i < h; i++) { | |
284 s += sq[pix1[0] - pix2[0]]; | |
285 s += sq[pix1[1] - pix2[1]]; | |
286 s += sq[pix1[2] - pix2[2]]; | |
287 s += sq[pix1[3] - pix2[3]]; | |
288 pix1 += line_size; | |
289 pix2 += line_size; | |
290 } | |
291 return s; | |
292 } | |
293 | |
1708 | 294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
936 | 295 { |
296 int s, i; | |
4179 | 297 uint32_t *sq = ff_squareTbl + 256; |
936 | 298 |
299 s = 0; | |
1708 | 300 for (i = 0; i < h; i++) { |
936 | 301 s += sq[pix1[0] - pix2[0]]; |
302 s += sq[pix1[1] - pix2[1]]; | |
303 s += sq[pix1[2] - pix2[2]]; | |
304 s += sq[pix1[3] - pix2[3]]; | |
305 s += sq[pix1[4] - pix2[4]]; | |
306 s += sq[pix1[5] - pix2[5]]; | |
307 s += sq[pix1[6] - pix2[6]]; | |
308 s += sq[pix1[7] - pix2[7]]; | |
309 pix1 += line_size; | |
310 pix2 += line_size; | |
311 } | |
312 return s; | |
313 } | |
314 | |
1708 | 315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
884 | 316 { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
317 int s, i; |
4179 | 318 uint32_t *sq = ff_squareTbl + 256; |
884 | 319 |
320 s = 0; | |
1708 | 321 for (i = 0; i < h; i++) { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
322 s += sq[pix1[ 0] - pix2[ 0]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
323 s += sq[pix1[ 1] - pix2[ 1]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
324 s += sq[pix1[ 2] - pix2[ 2]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
325 s += sq[pix1[ 3] - pix2[ 3]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
326 s += sq[pix1[ 4] - pix2[ 4]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
327 s += sq[pix1[ 5] - pix2[ 5]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
328 s += sq[pix1[ 6] - pix2[ 6]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
329 s += sq[pix1[ 7] - pix2[ 7]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
330 s += sq[pix1[ 8] - pix2[ 8]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
331 s += sq[pix1[ 9] - pix2[ 9]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
332 s += sq[pix1[10] - pix2[10]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
333 s += sq[pix1[11] - pix2[11]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
334 s += sq[pix1[12] - pix2[12]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
335 s += sq[pix1[13] - pix2[13]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
336 s += sq[pix1[14] - pix2[14]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
337 s += sq[pix1[15] - pix2[15]]; |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
338 |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
339 pix1 += line_size; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
340 pix2 += line_size; |
884 | 341 } |
342 return s; | |
343 } | |
344 | |
2184 | 345 |
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain comparison metric: takes the pixel difference of the two
 * blocks, runs a spatial DWT on it (ff_spatial_dwt from snow.c) and sums
 * the absolute subband coefficients, each weighted by a per-subband scale
 * factor.
 *
 * @param w     block width: 8, 16 or 32 (must equal h — asserted below)
 * @param type  1 selects the 5/3 wavelet, 0 the 9/7 wavelet
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;   /* decomposition levels by block size */
    int tmp[32*32];
    int level, ori;
    /* per-subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
        {
            {
                // 9/7 8x8 dec=3
                {268, 239, 239, 213},
                {  0, 224, 224, 152},
                {  0, 135, 135, 110},
            },{
                // 9/7 16x16 or 32x32 dec=4
                {344, 310, 310, 280},
                {  0, 320, 320, 228},
                {  0, 175, 175, 136},
                {  0, 129, 129, 102},
            }
        },{
            {
                // 5/3 8x8 dec=3
                {275, 245, 245, 218},
                {  0, 230, 230, 156},
                {  0, 138, 138, 113},
            },{
                // 5/3 16x16 or 32x32 dec=4
                {352, 317, 317, 286},
                {  0, 328, 328, 233},
                {  0, 180, 180, 140},
                {  0, 132, 132, 105},
            }
        }
    };

    /* difference signal, scaled up by 16 for precision in the transform */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;   /* undo the combined <<4 input scaling and weight magnitudes */
}
415 | |
/* Thin wrappers binding w_c to a fixed block width and wavelet type
 * (1 = 5/3 wavelet, 0 = 9/7 wavelet). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
431 | |
/* 32x32 5/3-wavelet cmp; non-static — also used outside this file. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
435 |
/* 32x32 9/7-wavelet cmp; non-static — also used outside this file. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
439 #endif |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
440 |
/* draw the edges of width 'w' of an image of size width, height
 * by replicating the border samples into the surrounding margin */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
469 | |
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* clamp a block lying completely outside the picture to the nearest
       position from which at least one sample is inside the source */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
540 | |
1064 | 541 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
0 | 542 { |
543 int i; | |
544 | |
545 /* read the pixels */ | |
546 for(i=0;i<8;i++) { | |
516 | 547 block[0] = pixels[0]; |
548 block[1] = pixels[1]; | |
549 block[2] = pixels[2]; | |
550 block[3] = pixels[3]; | |
551 block[4] = pixels[4]; | |
552 block[5] = pixels[5]; | |
553 block[6] = pixels[6]; | |
554 block[7] = pixels[7]; | |
555 pixels += line_size; | |
556 block += 8; | |
0 | 557 } |
558 } | |
559 | |
1064 | 560 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1, |
2979 | 561 const uint8_t *s2, int stride){ |
324 | 562 int i; |
563 | |
564 /* read the pixels */ | |
565 for(i=0;i<8;i++) { | |
516 | 566 block[0] = s1[0] - s2[0]; |
567 block[1] = s1[1] - s2[1]; | |
568 block[2] = s1[2] - s2[2]; | |
569 block[3] = s1[3] - s2[3]; | |
570 block[4] = s1[4] - s2[4]; | |
571 block[5] = s1[5] - s2[5]; | |
572 block[6] = s1[6] - s2[6]; | |
573 block[7] = s1[7] - s2[7]; | |
324 | 574 s1 += stride; |
575 s2 += stride; | |
516 | 576 block += 8; |
324 | 577 } |
578 } | |
579 | |
580 | |
1064 | 581 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 582 int line_size) |
0 | 583 { |
584 int i; | |
4176 | 585 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 586 |
0 | 587 /* read the pixels */ |
588 for(i=0;i<8;i++) { | |
516 | 589 pixels[0] = cm[block[0]]; |
590 pixels[1] = cm[block[1]]; | |
591 pixels[2] = cm[block[2]]; | |
592 pixels[3] = cm[block[3]]; | |
593 pixels[4] = cm[block[4]]; | |
594 pixels[5] = cm[block[5]]; | |
595 pixels[6] = cm[block[6]]; | |
596 pixels[7] = cm[block[7]]; | |
597 | |
598 pixels += line_size; | |
599 block += 8; | |
0 | 600 } |
601 } | |
602 | |
2256 | 603 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 604 int line_size) |
2256 | 605 { |
606 int i; | |
4176 | 607 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 608 |
2256 | 609 /* read the pixels */ |
610 for(i=0;i<4;i++) { | |
611 pixels[0] = cm[block[0]]; | |
612 pixels[1] = cm[block[1]]; | |
613 pixels[2] = cm[block[2]]; | |
614 pixels[3] = cm[block[3]]; | |
615 | |
616 pixels += line_size; | |
617 block += 8; | |
618 } | |
619 } | |
620 | |
2257 | 621 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 622 int line_size) |
2257 | 623 { |
624 int i; | |
4176 | 625 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 626 |
2257 | 627 /* read the pixels */ |
628 for(i=0;i<2;i++) { | |
629 pixels[0] = cm[block[0]]; | |
630 pixels[1] = cm[block[1]]; | |
631 | |
632 pixels += line_size; | |
633 block += 8; | |
634 } | |
635 } | |
636 | |
2967 | 637 static void put_signed_pixels_clamped_c(const DCTELEM *block, |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
638 uint8_t *restrict pixels, |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
639 int line_size) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
640 { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
641 int i, j; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
642 |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
643 for (i = 0; i < 8; i++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
644 for (j = 0; j < 8; j++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
645 if (*block < -128) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
646 *pixels = 0; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
647 else if (*block > 127) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
648 *pixels = 255; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
649 else |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
650 *pixels = (uint8_t)(*block + 128); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
651 block++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
652 pixels++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
653 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
654 pixels += (line_size - 8); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
655 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
656 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
657 |
1064 | 658 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
516 | 659 int line_size) |
0 | 660 { |
661 int i; | |
4176 | 662 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 663 |
0 | 664 /* read the pixels */ |
665 for(i=0;i<8;i++) { | |
516 | 666 pixels[0] = cm[pixels[0] + block[0]]; |
667 pixels[1] = cm[pixels[1] + block[1]]; | |
668 pixels[2] = cm[pixels[2] + block[2]]; | |
669 pixels[3] = cm[pixels[3] + block[3]]; | |
670 pixels[4] = cm[pixels[4] + block[4]]; | |
671 pixels[5] = cm[pixels[5] + block[5]]; | |
672 pixels[6] = cm[pixels[6] + block[6]]; | |
673 pixels[7] = cm[pixels[7] + block[7]]; | |
674 pixels += line_size; | |
675 block += 8; | |
0 | 676 } |
677 } | |
2256 | 678 |
679 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, | |
680 int line_size) | |
681 { | |
682 int i; | |
4176 | 683 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 684 |
2256 | 685 /* read the pixels */ |
686 for(i=0;i<4;i++) { | |
687 pixels[0] = cm[pixels[0] + block[0]]; | |
688 pixels[1] = cm[pixels[1] + block[1]]; | |
689 pixels[2] = cm[pixels[2] + block[2]]; | |
690 pixels[3] = cm[pixels[3] + block[3]]; | |
691 pixels += line_size; | |
692 block += 8; | |
693 } | |
694 } | |
2257 | 695 |
696 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, | |
697 int line_size) | |
698 { | |
699 int i; | |
4176 | 700 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 701 |
2257 | 702 /* read the pixels */ |
703 for(i=0;i<2;i++) { | |
704 pixels[0] = cm[pixels[0] + block[0]]; | |
705 pixels[1] = cm[pixels[1] + block[1]]; | |
706 pixels += line_size; | |
707 block += 8; | |
708 } | |
709 } | |
2763 | 710 |
711 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
712 { | |
713 int i; | |
714 for(i=0;i<8;i++) { | |
715 pixels[0] += block[0]; | |
716 pixels[1] += block[1]; | |
717 pixels[2] += block[2]; | |
718 pixels[3] += block[3]; | |
719 pixels[4] += block[4]; | |
720 pixels[5] += block[5]; | |
721 pixels[6] += block[6]; | |
722 pixels[7] += block[7]; | |
723 pixels += line_size; | |
724 block += 8; | |
725 } | |
726 } | |
727 | |
728 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
729 { | |
730 int i; | |
731 for(i=0;i<4;i++) { | |
732 pixels[0] += block[0]; | |
733 pixels[1] += block[1]; | |
734 pixels[2] += block[2]; | |
735 pixels[3] += block[3]; | |
736 pixels += line_size; | |
737 block += 4; | |
738 } | |
739 } | |
740 | |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
741 static int sum_abs_dctelem_c(DCTELEM *block) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
742 { |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
743 int sum=0, i; |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
744 for(i=0; i<64; i++) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
745 sum+= FFABS(block[i]); |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
746 return sum; |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
747 } |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
748 |
385 | 749 #if 0 |
750 | |
/* 64-bit-at-a-time halfpel put/avg helpers.  This whole section is
   disabled (#if 0); the 32-bit variant in the #else branch is the one
   that gets compiled.  For each OPNAME it generates plain, x2 (right
   half-pel), y2 (down half-pel) and xy2 (diagonal half-pel) copies,
   each in rounding and no-rounding flavours, plus 16-wide wrappers. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for (i = 0; i < h; i++) {\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for (i = 0; i < h; i++) {\
        const uint64_t a = AV_RN64(pixels  );\
        const uint64_t b = AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for (i = 0; i < h; i++) {\
        const uint64_t a = AV_RN64(pixels  );\
        const uint64_t b = AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for (i = 0; i < h; i++) {\
        const uint64_t a = AV_RN64(pixels          );\
        const uint64_t b = AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for (i = 0; i < h; i++) {\
        const uint64_t a = AV_RN64(pixels          );\
        const uint64_t b = AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a = AV_RN64(pixels  );\
    const uint64_t b = AV_RN64(pixels+1);\
    uint64_t l0 = (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0 = ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1, h1;\
\
    pixels += line_size;\
    for (i = 0; i < h; i += 2) {\
        uint64_t a = AV_RN64(pixels  );\
        uint64_t b = AV_RN64(pixels+1);\
        l1 = (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1 = ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
           + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
        a = AV_RN64(pixels  );\
        b = AV_RN64(pixels+1);\
        l0 = (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0 = ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
           + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a = AV_RN64(pixels  );\
    const uint64_t b = AV_RN64(pixels+1);\
    uint64_t l0 = (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0 = ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1, h1;\
\
    pixels += line_size;\
    for (i = 0; i < h; i += 2) {\
        uint64_t a = AV_RN64(pixels  );\
        uint64_t b = AV_RN64(pixels+1);\
        l1 = (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1 = ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
           + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
        a = AV_RN64(pixels  );\
        b = AV_RN64(pixels+1);\
        l0 = (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0 = ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
           + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
891 #else // 64 bit variant | |
892 | |
893 #define PIXOP2(OPNAME, OP) \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
894 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
895 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
896 for(i=0; i<h; i++){\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
897 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
898 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
899 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
900 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
901 }\ |
1168 | 902 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
903 int i;\ | |
904 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
905 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ |
1168 | 906 pixels+=line_size;\ |
907 block +=line_size;\ | |
908 }\ | |
909 }\ | |
859 | 910 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
385 | 911 int i;\ |
912 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
913 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
914 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\ |
385 | 915 pixels+=line_size;\ |
916 block +=line_size;\ | |
917 }\ | |
918 }\ | |
859 | 919 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
920 OPNAME ## _pixels8_c(block, pixels, line_size, h);\ | |
651 | 921 }\ |
385 | 922 \ |
651 | 923 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
924 int src_stride1, int src_stride2, int h){\ | |
385 | 925 int i;\ |
926 for(i=0; i<h; i++){\ | |
651 | 927 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
928 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
929 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 930 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
931 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
932 b= AV_RN32(&src2[i*src_stride2+4]);\ |
1264 | 933 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\ |
385 | 934 }\ |
935 }\ | |
936 \ | |
651 | 937 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
938 int src_stride1, int src_stride2, int h){\ | |
385 | 939 int i;\ |
940 for(i=0; i<h; i++){\ | |
651 | 941 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
942 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
943 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 944 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
945 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
946 b= AV_RN32(&src2[i*src_stride2+4]);\ |
1264 | 947 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ |
385 | 948 }\ |
949 }\ | |
950 \ | |
1168 | 951 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
952 int src_stride1, int src_stride2, int h){\ | |
953 int i;\ | |
954 for(i=0; i<h; i++){\ | |
955 uint32_t a,b;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
956 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
957 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 958 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
1168 | 959 }\ |
960 }\ | |
961 \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
962 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
963 int src_stride1, int src_stride2, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
964 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
965 for(i=0; i<h; i++){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
966 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
967 a= AV_RN16(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
968 b= AV_RN16(&src2[i*src_stride2 ]);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
969 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
970 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
971 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
972 \ |
651 | 973 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
974 int src_stride1, int src_stride2, int h){\ | |
975 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
976 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
977 }\ | |
978 \ | |
979 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ | |
980 int src_stride1, int src_stride2, int h){\ | |
981 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
982 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
983 }\ | |
984 \ | |
859 | 985 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 986 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
987 }\ | |
988 \ | |
859 | 989 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 990 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
991 }\ | |
992 \ | |
859 | 993 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 994 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
995 }\ | |
996 \ | |
859 | 997 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 998 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
385 | 999 }\ |
1000 \ | |
651 | 1001 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1002 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1003 int i;\ | |
1004 for(i=0; i<h; i++){\ | |
1005 uint32_t a, b, c, d, l0, l1, h0, h1;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1006 a= AV_RN32(&src1[i*src_stride1]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1007 b= AV_RN32(&src2[i*src_stride2]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1008 c= AV_RN32(&src3[i*src_stride3]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1009 d= AV_RN32(&src4[i*src_stride4]);\ |
651 | 1010 l0= (a&0x03030303UL)\ |
1011 + (b&0x03030303UL)\ | |
1012 + 0x02020202UL;\ | |
1013 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1014 + ((b&0xFCFCFCFCUL)>>2);\ | |
1015 l1= (c&0x03030303UL)\ | |
1016 + (d&0x03030303UL);\ | |
1017 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1018 + ((d&0xFCFCFCFCUL)>>2);\ | |
1019 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1020 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1021 b= AV_RN32(&src2[i*src_stride2+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1022 c= AV_RN32(&src3[i*src_stride3+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1023 d= AV_RN32(&src4[i*src_stride4+4]);\ |
651 | 1024 l0= (a&0x03030303UL)\ |
1025 + (b&0x03030303UL)\ | |
1026 + 0x02020202UL;\ | |
1027 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1028 + ((b&0xFCFCFCFCUL)>>2);\ | |
1029 l1= (c&0x03030303UL)\ | |
1030 + (d&0x03030303UL);\ | |
1031 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1032 + ((d&0xFCFCFCFCUL)>>2);\ | |
1033 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1034 }\ | |
1035 }\ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1036 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1037 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1038 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1039 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1040 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1041 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1042 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1043 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1044 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1045 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1046 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1047 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1048 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1049 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1050 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1051 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1052 \ |
651 | 1053 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1054 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
385 | 1055 int i;\ |
1056 for(i=0; i<h; i++){\ | |
651 | 1057 uint32_t a, b, c, d, l0, l1, h0, h1;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1058 a= AV_RN32(&src1[i*src_stride1]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1059 b= AV_RN32(&src2[i*src_stride2]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1060 c= AV_RN32(&src3[i*src_stride3]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1061 d= AV_RN32(&src4[i*src_stride4]);\ |
651 | 1062 l0= (a&0x03030303UL)\ |
1063 + (b&0x03030303UL)\ | |
1064 + 0x01010101UL;\ | |
1065 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1066 + ((b&0xFCFCFCFCUL)>>2);\ | |
1067 l1= (c&0x03030303UL)\ | |
1068 + (d&0x03030303UL);\ | |
1069 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1070 + ((d&0xFCFCFCFCUL)>>2);\ | |
1071 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1072 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1073 b= AV_RN32(&src2[i*src_stride2+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1074 c= AV_RN32(&src3[i*src_stride3+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1075 d= AV_RN32(&src4[i*src_stride4+4]);\ |
651 | 1076 l0= (a&0x03030303UL)\ |
1077 + (b&0x03030303UL)\ | |
1078 + 0x01010101UL;\ | |
1079 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1080 + ((b&0xFCFCFCFCUL)>>2);\ | |
1081 l1= (c&0x03030303UL)\ | |
1082 + (d&0x03030303UL);\ | |
1083 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1084 + ((d&0xFCFCFCFCUL)>>2);\ | |
1085 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
385 | 1086 }\ |
1087 }\ | |
651 | 1088 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1089 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1090 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1091 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1092 }\ | |
1093 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ | |
1094 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1095 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1096 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1097 }\ | |
385 | 1098 \ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1099 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1100 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1101 int i, a0, b0, a1, b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1102 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1103 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1104 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1105 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1106 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1107 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1108 for(i=0; i<h; i+=2){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1109 a1= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1110 b1= pixels[1];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1111 a1 += b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1112 b1 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1113 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1114 block[0]= (a1+a0)>>2; /* FIXME non put */\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1115 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1116 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1117 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1118 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1119 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1120 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1121 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1122 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1123 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1124 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1125 block[0]= (a1+a0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1126 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1127 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1128 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1129 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1130 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1131 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1132 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1133 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1134 int i;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1135 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1136 const uint32_t b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1137 uint32_t l0= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1138 + (b&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1139 + 0x02020202UL;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1140 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1141 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1142 uint32_t l1,h1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1143 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1144 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1145 for(i=0; i<h; i+=2){\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1146 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1147 uint32_t b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1148 l1= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1149 + (b&0x03030303UL);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1150 h1= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1151 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1152 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1153 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1154 block +=line_size;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1155 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1156 b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1157 l0= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1158 + (b&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1159 + 0x02020202UL;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1160 h0= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1161 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1162 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1163 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1164 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1165 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1166 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1167 \ |
859 | 1168 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 1169 {\ |
1170 int j;\ | |
1171 for(j=0; j<2; j++){\ | |
1172 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1173 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1174 const uint32_t b= AV_RN32(pixels+1);\ |
385 | 1175 uint32_t l0= (a&0x03030303UL)\ |
1176 + (b&0x03030303UL)\ | |
1177 + 0x02020202UL;\ | |
1178 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1179 + ((b&0xFCFCFCFCUL)>>2);\ | |
1180 uint32_t l1,h1;\ | |
1181 \ | |
1182 pixels+=line_size;\ | |
1183 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1184 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1185 uint32_t b= AV_RN32(pixels+1);\ |
385 | 1186 l1= (a&0x03030303UL)\ |
1187 + (b&0x03030303UL);\ | |
1188 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
1189 + ((b&0xFCFCFCFCUL)>>2);\ | |
1190 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1191 pixels+=line_size;\ | |
1192 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1193 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1194 b= AV_RN32(pixels+1);\ |
385 | 1195 l0= (a&0x03030303UL)\ |
1196 + (b&0x03030303UL)\ | |
1197 + 0x02020202UL;\ | |
1198 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1199 + ((b&0xFCFCFCFCUL)>>2);\ | |
1200 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1201 pixels+=line_size;\ | |
1202 block +=line_size;\ | |
1203 }\ | |
1204 pixels+=4-line_size*(h+1);\ | |
1205 block +=4-line_size*h;\ | |
1206 }\ | |
1207 }\ | |
1208 \ | |
859 | 1209 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 1210 {\ |
1211 int j;\ | |
1212 for(j=0; j<2; j++){\ | |
1213 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1214 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1215 const uint32_t b= AV_RN32(pixels+1);\ |
385 | 1216 uint32_t l0= (a&0x03030303UL)\ |
1217 + (b&0x03030303UL)\ | |
1218 + 0x01010101UL;\ | |
1219 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1220 + ((b&0xFCFCFCFCUL)>>2);\ | |
1221 uint32_t l1,h1;\ | |
1222 \ | |
1223 pixels+=line_size;\ | |
1224 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1225 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1226 uint32_t b= AV_RN32(pixels+1);\ |
385 | 1227 l1= (a&0x03030303UL)\ |
1228 + (b&0x03030303UL);\ | |
1229 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
1230 + ((b&0xFCFCFCFCUL)>>2);\ | |
1231 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1232 pixels+=line_size;\ | |
1233 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1234 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1235 b= AV_RN32(pixels+1);\ |
385 | 1236 l0= (a&0x03030303UL)\ |
1237 + (b&0x03030303UL)\ | |
1238 + 0x01010101UL;\ | |
1239 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1240 + ((b&0xFCFCFCFCUL)>>2);\ | |
1241 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1242 pixels+=line_size;\ | |
1243 block +=line_size;\ | |
1244 }\ | |
1245 pixels+=4-line_size*(h+1);\ | |
1246 block +=4-line_size*h;\ | |
1247 }\ | |
1248 }\ | |
1249 \ | |
859 | 1250 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ |
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\ | |
1252 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\ | |
1253 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\ | |
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\ | |
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\ | |
1256 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\ | |
1257 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\ | |
651 | 1258 |
1264 | 1259 #define op_avg(a, b) a = rnd_avg32(a, b) |
385 | 1260 #endif |
1261 #define op_put(a, b) a = b | |
1262 | |
1263 PIXOP2(avg, op_avg) | |
1264 PIXOP2(put, op_put) | |
1265 #undef op_avg | |
1266 #undef op_put | |
1267 | |
0 | 1268 #define avg2(a,b) ((a+b+1)>>1) |
1269 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) | |
1270 | |
1864 | 1271 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){ |
1272 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h); | |
1273 } | |
1274 | |
1275 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){ | |
1276 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h); | |
1277 } | |
753 | 1278 |
1064 | 1279 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder) |
255 | 1280 { |
1281 const int A=(16-x16)*(16-y16); | |
1282 const int B=( x16)*(16-y16); | |
1283 const int C=(16-x16)*( y16); | |
1284 const int D=( x16)*( y16); | |
1285 int i; | |
1286 | |
1287 for(i=0; i<h; i++) | |
1288 { | |
651 | 1289 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8; |
1290 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8; | |
1291 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8; | |
1292 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8; | |
1293 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8; | |
1294 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8; | |
1295 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8; | |
1296 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8; | |
1297 dst+= stride; | |
1298 src+= stride; | |
255 | 1299 } |
1300 } | |
1301 | |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
1302 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
753 | 1303 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) |
1304 { | |
1305 int y, vx, vy; | |
1306 const int s= 1<<shift; | |
2967 | 1307 |
753 | 1308 width--; |
1309 height--; | |
1310 | |
1311 for(y=0; y<h; y++){ | |
1312 int x; | |
1313 | |
1314 vx= ox; | |
1315 vy= oy; | |
1316 for(x=0; x<8; x++){ //XXX FIXME optimize | |
1317 int src_x, src_y, frac_x, frac_y, index; | |
1318 | |
1319 src_x= vx>>16; | |
1320 src_y= vy>>16; | |
1321 frac_x= src_x&(s-1); | |
1322 frac_y= src_y&(s-1); | |
1323 src_x>>=shift; | |
1324 src_y>>=shift; | |
2967 | 1325 |
753 | 1326 if((unsigned)src_x < width){ |
1327 if((unsigned)src_y < height){ | |
1328 index= src_x + src_y*stride; | |
1329 dst[y*stride + x]= ( ( src[index ]*(s-frac_x) | |
1330 + src[index +1]* frac_x )*(s-frac_y) | |
1331 + ( src[index+stride ]*(s-frac_x) | |
1332 + src[index+stride+1]* frac_x )* frac_y | |
1333 + r)>>(shift*2); | |
1334 }else{ | |
4594 | 1335 index= src_x + av_clip(src_y, 0, height)*stride; |
2967 | 1336 dst[y*stride + x]= ( ( src[index ]*(s-frac_x) |
753 | 1337 + src[index +1]* frac_x )*s |
1338 + r)>>(shift*2); | |
1339 } | |
1340 }else{ | |
1341 if((unsigned)src_y < height){ | |
4594 | 1342 index= av_clip(src_x, 0, width) + src_y*stride; |
2967 | 1343 dst[y*stride + x]= ( ( src[index ]*(s-frac_y) |
753 | 1344 + src[index+stride ]* frac_y )*s |
1345 + r)>>(shift*2); | |
1346 }else{ | |
4594 | 1347 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride; |
753 | 1348 dst[y*stride + x]= src[index ]; |
1349 } | |
1350 } | |
2967 | 1351 |
753 | 1352 vx+= dxx; |
1353 vy+= dyx; | |
1354 } | |
1355 ox += dxy; | |
1356 oy += dyy; | |
1357 } | |
1358 } | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1359 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1360 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1361 switch(width){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1362 case 2: put_pixels2_c (dst, src, stride, height); break; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1363 case 4: put_pixels4_c (dst, src, stride, height); break; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1364 case 8: put_pixels8_c (dst, src, stride, height); break; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1365 case 16:put_pixels16_c(dst, src, stride, height); break; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1366 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1367 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1368 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1369 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1370 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1371 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1372 for (j=0; j < width; j++) { |
2979 | 1373 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1374 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1375 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1376 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1377 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1378 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1379 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation, horizontal phase 2/3 (weights 1:2 toward src[x+1]).
 * 683/2048 (i.e. *683 >> 11) is the fixed-point approximation of 1/3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++)
            dst[base + col] = (683*(src[base + col] + 2*src[base + col + 1] + 1)) >> 11;
    }
}
2967 | 1390 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation, vertical phase 1/3 (weights 2:1, current row dominant).
 * 683/2048 (i.e. *683 >> 11) is the fixed-point approximation of 1/3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++)
            dst[base + col] = (683*(2*src[base + col] + src[base + col + stride] + 1)) >> 11;
    }
}
2967 | 1401 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation at (1/3, 1/3): bilinear blend of the 2x2 neighbourhood
 * with weights 4:3:3:2 (sum 12); 2731/32768 (*2731 >> 15) approximates 1/12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++) {
            const int p = base + col;
            dst[p] = (2731*(4*src[p] + 3*src[p+1] + 3*src[p+stride] + 2*src[p+stride+1] + 6)) >> 15;
        }
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1412 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation at (1/3, 2/3): 2x2 bilinear blend with weights
 * 3:2:4:3 (sum 12); 2731/32768 (*2731 >> 15) approximates 1/12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++) {
            const int p = base + col;
            dst[p] = (2731*(3*src[p] + 2*src[p+1] + 4*src[p+stride] + 3*src[p+stride+1] + 6)) >> 15;
        }
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1423 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation, vertical phase 2/3 (weights 1:2, next row dominant).
 * 683/2048 (i.e. *683 >> 11) is the fixed-point approximation of 1/3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++)
            dst[base + col] = (683*(src[base + col] + 2*src[base + col + stride] + 1)) >> 11;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1434 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation at (2/3, 1/3): 2x2 bilinear blend with weights
 * 3:4:2:3 (sum 12); 2731/32768 (*2731 >> 15) approximates 1/12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++) {
            const int p = base + col;
            dst[p] = (2731*(3*src[p] + 4*src[p+1] + 2*src[p+stride] + 3*src[p+stride+1] + 6)) >> 15;
        }
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1445 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel interpolation at (2/3, 2/3): 2x2 bilinear blend with weights
 * 2:3:3:4 (sum 12); 2731/32768 (*2731 >> 15) approximates 1/12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const int base = row * stride;
        for (col = 0; col < width; col++) {
            const int p = base + col;
            dst[p] = (2731*(2*src[p] + 3*src[p+1] + 3*src[p+stride] + 4*src[p+stride+1] + 6)) >> 15;
        }
    }
}
1319 | 1456 |
/**
 * Thirdpel "no subpel offset" averaging: delegates to the plain block
 * averagers for the supported widths; other widths are a no-op.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1465 | |
/**
 * Thirdpel horizontal 1/3 interpolation (weights 2:1, *683 >> 11 ~= /3),
 * averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (683*(2*src[x] + src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
1476 | |
/**
 * Thirdpel horizontal 2/3 interpolation (weights 1:2, *683 >> 11 ~= /3),
 * averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
2967 | 1487 |
/**
 * Thirdpel vertical 1/3 interpolation (weights 2:1, *683 >> 11 ~= /3),
 * averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
2967 | 1498 |
/**
 * Thirdpel (1/3, 1/3) interpolation: 2x2 blend, weights 4:3:3:2 out of 12
 * (*2731 >> 15 ~= /12), averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
1509 | |
/**
 * Thirdpel (1/3, 2/3) interpolation: 2x2 blend, weights 3:2:4:3 out of 12
 * (*2731 >> 15 ~= /12), averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
1520 | |
/**
 * Thirdpel vertical 2/3 interpolation (weights 1:2, *683 >> 11 ~= /3),
 * averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
1531 | |
/**
 * Thirdpel (2/3, 1/3) interpolation: 2x2 blend, weights 3:4:2:3 out of 12
 * (*2731 >> 15 ~= /12), averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
1542 | |
/**
 * Thirdpel (2/3, 2/3) interpolation: 2x2 blend, weights 2:3:3:4 out of 12
 * (*2731 >> 15 ~= /12), averaged into dst with round-up.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int t = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + t + 1) >> 1;
        }
    }
}
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1553 #if 0 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1554 #define TPEL_WIDTH(width)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1555 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1556 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1557 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1558 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1559 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1560 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1561 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1562 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1563 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1564 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1565 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1566 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1567 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1568 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1569 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1570 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1571 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1572 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);} |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1573 #endif |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1574 |
1168 | 1575 #define H264_CHROMA_MC(OPNAME, OP)\ |
1576 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1577 const int A=(8-x)*(8-y);\ | |
1578 const int B=( x)*(8-y);\ | |
1579 const int C=(8-x)*( y);\ | |
1580 const int D=( x)*( y);\ | |
1581 int i;\ | |
1582 \ | |
1583 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1584 \ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1585 if(D){\ |
6054 | 1586 for(i=0; i<h; i++){\ |
6053 | 1587 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ |
1588 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1589 dst+= stride;\ | |
1590 src+= stride;\ | |
1591 }\ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1592 }else{\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1593 const int E= B+C;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1594 const int step= C ? stride : 1;\ |
6054 | 1595 for(i=0; i<h; i++){\ |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1596 OP(dst[0], (A*src[0] + E*src[step+0]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1597 OP(dst[1], (A*src[1] + E*src[step+1]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1598 dst+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1599 src+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1600 }\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1601 }\ |
1168 | 1602 }\ |
1603 \ | |
1604 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1605 const int A=(8-x)*(8-y);\ | |
1606 const int B=( x)*(8-y);\ | |
1607 const int C=(8-x)*( y);\ | |
1608 const int D=( x)*( y);\ | |
1609 int i;\ | |
1610 \ | |
1611 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1612 \ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1613 if(D){\ |
6054 | 1614 for(i=0; i<h; i++){\ |
6053 | 1615 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ |
1616 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1617 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ | |
1618 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ | |
1619 dst+= stride;\ | |
1620 src+= stride;\ | |
1621 }\ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1622 }else{\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1623 const int E= B+C;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1624 const int step= C ? stride : 1;\ |
6054 | 1625 for(i=0; i<h; i++){\ |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1626 OP(dst[0], (A*src[0] + E*src[step+0]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1627 OP(dst[1], (A*src[1] + E*src[step+1]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1628 OP(dst[2], (A*src[2] + E*src[step+2]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1629 OP(dst[3], (A*src[3] + E*src[step+3]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1630 dst+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1631 src+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1632 }\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1633 }\ |
1168 | 1634 }\ |
1635 \ | |
1636 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1637 const int A=(8-x)*(8-y);\ | |
1638 const int B=( x)*(8-y);\ | |
1639 const int C=(8-x)*( y);\ | |
1640 const int D=( x)*( y);\ | |
1641 int i;\ | |
1642 \ | |
1643 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1644 \ | |
6051
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1645 if(D){\ |
6054 | 1646 for(i=0; i<h; i++){\ |
6053 | 1647 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ |
1648 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1649 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ | |
1650 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ | |
1651 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\ | |
1652 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\ | |
1653 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\ | |
1654 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\ | |
1655 dst+= stride;\ | |
1656 src+= stride;\ | |
1657 }\ | |
6051
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1658 }else{\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1659 const int E= B+C;\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1660 const int step= C ? stride : 1;\ |
6054 | 1661 for(i=0; i<h; i++){\ |
6051
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1662 OP(dst[0], (A*src[0] + E*src[step+0]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1663 OP(dst[1], (A*src[1] + E*src[step+1]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1664 OP(dst[2], (A*src[2] + E*src[step+2]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1665 OP(dst[3], (A*src[3] + E*src[step+3]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1666 OP(dst[4], (A*src[4] + E*src[step+4]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1667 OP(dst[5], (A*src[5] + E*src[step+5]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1668 OP(dst[6], (A*src[6] + E*src[step+6]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1669 OP(dst[7], (A*src[7] + E*src[step+7]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1670 dst+= stride;\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1671 src+= stride;\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1672 }\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1673 }\ |
1168 | 1674 } |
1675 | |
1676 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) | |
1677 #define op_put(a, b) a = (((b) + 32)>>6) | |
1678 | |
1679 H264_CHROMA_MC(put_ , op_put) | |
1680 H264_CHROMA_MC(avg_ , op_avg) | |
1681 #undef op_avg | |
1682 #undef op_put | |
1683 | |
9439
ef3a7b711cc0
Rename put_no_rnd_h264_chroma* to reflect its usage in VC1 only
conrad
parents:
9437
diff
changeset
|
/**
 * VC-1 8-wide chroma MC, "no rounding" variant: bilinear 2x2 blend with
 * 1/8-pel weights A..D (sum 64) and bias 32-4 instead of the normal 32.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8-x)*(8-y);
    const int B = (  x)*(8-y);
    const int C = (8-x)*(  y);
    const int D = (  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1707 | |
/**
 * VC-1 "no rounding" 8-pixel-wide chroma MC, averaging variant.
 *
 * Same bilinear interpolation as put_no_rnd_vc1_chroma_mc8_c (weights
 * sum to 64, biased rounding constant 32-4), but each interpolated value
 * is combined with the existing contents of dst through avg2() — a
 * helper macro defined earlier in this file — for bi-predicted blocks.
 *
 * @param dst destination/accumulator block, 8 pixels per row (align 8)
 * @param src source pixels (align 1)
 * @param stride row stride of both dst and src
 * @param h   number of rows to process
 * @param x   horizontal sub-pel position, 0..7
 * @param y   vertical sub-pel position, 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
1731 | |
/**
 * Generates the complete set of MPEG-4 quarter-pel motion-compensation
 * functions (8x8 and 16x16 blocks, all 16 sub-pel positions) for one
 * combination of rounding mode and store operation.
 *
 * @param r      unused in the expansion (kept for template symmetry)
 * @param OPNAME prefix of the generated functions (e.g. put_, avg_)
 * @param RND    infix selecting the rounding variant of the helper calls
 *               (_ or _no_rnd_)
 * @param OP     store macro that normalizes the filtered sum (the lowpass
 *               filters below have gain 20+20-6-6+3+3-1-1 = 32) and writes
 *               it into dst
 *
 * The lowpass filters use taps (20,-6,3,-1) and mirror the source at the
 * block edges — hence the repeated indices in the outermost taps.
 * mcXY entry points: X and Y are the quarter-pel offsets (0..3); the
 * ff_*_old variants build the prediction from four planes (full, H, V,
 * HV) instead of the two-pass scheme used by the current functions.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8 quarter-pel entry points; mcXY = quarter-pel offset (X,Y) ---- */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* ---- 16x16 quarter-pel entry points, same scheme as the 8x8 set ---- */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
255 | 2214 |
651 | 2215 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) |
2216 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1) | |
2217 #define op_put(a, b) a = cm[((b) + 16)>>5] | |
2218 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5] | |
2219 | |
2220 QPEL_MC(0, put_ , _ , op_put) | |
2221 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd) | |
2222 QPEL_MC(0, avg_ , _ , op_avg) | |
2223 //QPEL_MC(1, avg_no_rnd , _ , op_avg) | |
2224 #undef op_avg | |
2225 #undef op_avg_no_rnd | |
2226 #undef op_put | |
2227 #undef op_put_no_rnd | |
255 | 2228 |
1168 | 2229 #if 1 |
2230 #define H264_LOWPASS(OPNAME, OP, OP2) \ | |
5151 | 2231 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2232 const int h=2;\ |
4176 | 2233 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2234 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2235 for(i=0; i<h; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2236 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2237 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2238 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2239 dst+=dstStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2240 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2241 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2242 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2243 \ |
5151 | 2244 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2245 const int w=2;\ |
4176 | 2246 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2247 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2248 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2249 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2250 const int srcB= src[-2*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2251 const int srcA= src[-1*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2252 const int src0= src[0 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2253 const int src1= src[1 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2254 const int src2= src[2 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2255 const int src3= src[3 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2256 const int src4= src[4 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2257 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2258 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2259 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2260 src++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2261 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2262 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2263 \ |
5151 | 2264 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2265 const int h=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2266 const int w=2;\ |
4176 | 2267 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2268 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2269 src -= 2*srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2270 for(i=0; i<h+5; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2271 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2272 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2273 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2274 tmp+=tmpStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2275 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2276 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2277 tmp -= tmpStride*(h+5-2);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2278 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2279 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2280 const int tmpB= tmp[-2*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2281 const int tmpA= tmp[-1*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2282 const int tmp0= tmp[0 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2283 const int tmp1= tmp[1 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2284 const int tmp2= tmp[2 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2285 const int tmp3= tmp[3 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2286 const int tmp4= tmp[4 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2287 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2288 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2289 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2290 tmp++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2291 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2292 }\ |
1168 | 2293 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2294 const int h=4;\ | |
4176 | 2295 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2296 int i;\ |
2297 for(i=0; i<h; i++)\ | |
2298 {\ | |
2299 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ | |
2300 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ | |
2301 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ | |
2302 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ | |
2303 dst+=dstStride;\ | |
2304 src+=srcStride;\ | |
2305 }\ | |
2306 }\ | |
2307 \ | |
2308 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2309 const int w=4;\ | |
4176 | 2310 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2311 int i;\ |
2312 for(i=0; i<w; i++)\ | |
2313 {\ | |
2314 const int srcB= src[-2*srcStride];\ | |
2315 const int srcA= src[-1*srcStride];\ | |
2316 const int src0= src[0 *srcStride];\ | |
2317 const int src1= src[1 *srcStride];\ | |
2318 const int src2= src[2 *srcStride];\ | |
2319 const int src3= src[3 *srcStride];\ | |
2320 const int src4= src[4 *srcStride];\ | |
2321 const int src5= src[5 *srcStride];\ | |
2322 const int src6= src[6 *srcStride];\ | |
2323 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ | |
2324 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ | |
2325 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ | |
2326 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ | |
2327 dst++;\ | |
2328 src++;\ | |
2329 }\ | |
2330 }\ | |
2331 \ | |
2332 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2333 const int h=4;\ | |
2334 const int w=4;\ | |
4176 | 2335 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2336 int i;\ |
2337 src -= 2*srcStride;\ | |
2338 for(i=0; i<h+5; i++)\ | |
2339 {\ | |
2340 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ | |
2341 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ | |
2342 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\ | |
2343 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\ | |
2344 tmp+=tmpStride;\ | |
2345 src+=srcStride;\ | |
2346 }\ | |
2347 tmp -= tmpStride*(h+5-2);\ | |
2348 for(i=0; i<w; i++)\ | |
2349 {\ | |
2350 const int tmpB= tmp[-2*tmpStride];\ | |
2351 const int tmpA= tmp[-1*tmpStride];\ | |
2352 const int tmp0= tmp[0 *tmpStride];\ | |
2353 const int tmp1= tmp[1 *tmpStride];\ | |
2354 const int tmp2= tmp[2 *tmpStride];\ | |
2355 const int tmp3= tmp[3 *tmpStride];\ | |
2356 const int tmp4= tmp[4 *tmpStride];\ | |
2357 const int tmp5= tmp[5 *tmpStride];\ | |
2358 const int tmp6= tmp[6 *tmpStride];\ | |
2359 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ | |
2360 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ | |
2361 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ | |
2362 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ | |
2363 dst++;\ | |
2364 tmp++;\ | |
2365 }\ | |
2366 }\ | |
2367 \ | |
2368 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2369 const int h=8;\ | |
4176 | 2370 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2371 int i;\ |
2372 for(i=0; i<h; i++)\ | |
2373 {\ | |
2374 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\ | |
2375 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\ | |
2376 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\ | |
2377 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\ | |
2378 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\ | |
2379 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\ | |
2380 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\ | |
2381 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\ | |
2382 dst+=dstStride;\ | |
2383 src+=srcStride;\ | |
2384 }\ | |
2385 }\ | |
2386 \ | |
2387 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2388 const int w=8;\ | |
4176 | 2389 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2390 int i;\ |
2391 for(i=0; i<w; i++)\ | |
2392 {\ | |
2393 const int srcB= src[-2*srcStride];\ | |
2394 const int srcA= src[-1*srcStride];\ | |
2395 const int src0= src[0 *srcStride];\ | |
2396 const int src1= src[1 *srcStride];\ | |
2397 const int src2= src[2 *srcStride];\ | |
2398 const int src3= src[3 *srcStride];\ | |
2399 const int src4= src[4 *srcStride];\ | |
2400 const int src5= src[5 *srcStride];\ | |
2401 const int src6= src[6 *srcStride];\ | |
2402 const int src7= src[7 *srcStride];\ | |
2403 const int src8= src[8 *srcStride];\ | |
2404 const int src9= src[9 *srcStride];\ | |
2405 const int src10=src[10*srcStride];\ | |
2406 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ | |
2407 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ | |
2408 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ | |
2409 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ | |
2410 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\ | |
2411 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\ | |
2412 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\ | |
2413 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\ | |
2414 dst++;\ | |
2415 src++;\ | |
2416 }\ | |
2417 }\ | |
2418 \ | |
2419 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2420 const int h=8;\ | |
2421 const int w=8;\ | |
4176 | 2422 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2423 int i;\ |
2424 src -= 2*srcStride;\ | |
2425 for(i=0; i<h+5; i++)\ | |
2426 {\ | |
2427 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\ | |
2428 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\ | |
2429 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\ | |
2430 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\ | |
2431 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\ | |
2432 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\ | |
2433 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\ | |
2434 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\ | |
2435 tmp+=tmpStride;\ | |
2436 src+=srcStride;\ | |
2437 }\ | |
2438 tmp -= tmpStride*(h+5-2);\ | |
2439 for(i=0; i<w; i++)\ | |
2440 {\ | |
2441 const int tmpB= tmp[-2*tmpStride];\ | |
2442 const int tmpA= tmp[-1*tmpStride];\ | |
2443 const int tmp0= tmp[0 *tmpStride];\ | |
2444 const int tmp1= tmp[1 *tmpStride];\ | |
2445 const int tmp2= tmp[2 *tmpStride];\ | |
2446 const int tmp3= tmp[3 *tmpStride];\ | |
2447 const int tmp4= tmp[4 *tmpStride];\ | |
2448 const int tmp5= tmp[5 *tmpStride];\ | |
2449 const int tmp6= tmp[6 *tmpStride];\ | |
2450 const int tmp7= tmp[7 *tmpStride];\ | |
2451 const int tmp8= tmp[8 *tmpStride];\ | |
2452 const int tmp9= tmp[9 *tmpStride];\ | |
2453 const int tmp10=tmp[10*tmpStride];\ | |
2454 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ | |
2455 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ | |
2456 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ | |
2457 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ | |
2458 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\ | |
2459 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\ | |
2460 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\ | |
2461 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\ | |
2462 dst++;\ | |
2463 tmp++;\ | |
2464 }\ | |
2465 }\ | |
2466 \ | |
2467 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2468 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ | |
2469 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2470 src += 8*srcStride;\ | |
2471 dst += 8*dstStride;\ | |
2472 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ | |
2473 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2474 }\ | |
2475 \ | |
2476 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2477 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ | |
2478 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2479 src += 8*srcStride;\ | |
2480 dst += 8*dstStride;\ | |
2481 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ | |
2482 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2483 }\ | |
2484 \ | |
2485 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2486 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2487 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2488 src += 8*srcStride;\ | |
2489 dst += 8*dstStride;\ | |
2490 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2491 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2492 }\ | |
2493 | |
2494 #define H264_MC(OPNAME, SIZE) \ | |
2495 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\ | |
2496 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ | |
2497 }\ | |
2498 \ | |
2499 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2500 uint8_t half[SIZE*SIZE];\ | |
2501 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ | |
2502 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ | |
2503 }\ | |
2504 \ | |
2505 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2506 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\ | |
2507 }\ | |
2508 \ | |
2509 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2510 uint8_t half[SIZE*SIZE];\ | |
2511 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ | |
2512 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ | |
2513 }\ | |
2514 \ | |
2515 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2516 uint8_t full[SIZE*(SIZE+5)];\ | |
2517 uint8_t * const full_mid= full + SIZE*2;\ | |
2518 uint8_t half[SIZE*SIZE];\ | |
2519 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2520 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ | |
2521 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ | |
2522 }\ | |
2523 \ | |
2524 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2525 uint8_t full[SIZE*(SIZE+5)];\ | |
2526 uint8_t * const full_mid= full + SIZE*2;\ | |
2527 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2528 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\ | |
2529 }\ | |
2530 \ | |
2531 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2532 uint8_t full[SIZE*(SIZE+5)];\ | |
2533 uint8_t * const full_mid= full + SIZE*2;\ | |
2534 uint8_t half[SIZE*SIZE];\ | |
2535 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2536 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ | |
2537 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ | |
2538 }\ | |
2539 \ | |
2540 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2541 uint8_t full[SIZE*(SIZE+5)];\ | |
2542 uint8_t * const full_mid= full + SIZE*2;\ | |
2543 uint8_t halfH[SIZE*SIZE];\ | |
2544 uint8_t halfV[SIZE*SIZE];\ | |
2545 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2546 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2547 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2548 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2549 }\ | |
2550 \ | |
2551 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2552 uint8_t full[SIZE*(SIZE+5)];\ | |
2553 uint8_t * const full_mid= full + SIZE*2;\ | |
2554 uint8_t halfH[SIZE*SIZE];\ | |
2555 uint8_t halfV[SIZE*SIZE];\ | |
2556 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2557 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2558 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2559 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2560 }\ | |
2561 \ | |
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2563 uint8_t full[SIZE*(SIZE+5)];\ | |
2564 uint8_t * const full_mid= full + SIZE*2;\ | |
2565 uint8_t halfH[SIZE*SIZE];\ | |
2566 uint8_t halfV[SIZE*SIZE];\ | |
2567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2568 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2569 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2570 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2571 }\ | |
2572 \ | |
2573 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2574 uint8_t full[SIZE*(SIZE+5)];\ | |
2575 uint8_t * const full_mid= full + SIZE*2;\ | |
2576 uint8_t halfH[SIZE*SIZE];\ | |
2577 uint8_t halfV[SIZE*SIZE];\ | |
2578 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2579 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2580 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2581 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2582 }\ | |
2583 \ | |
2584 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2585 int16_t tmp[SIZE*(SIZE+5)];\ | |
2586 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\ | |
2587 }\ | |
2588 \ | |
2589 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2590 int16_t tmp[SIZE*(SIZE+5)];\ | |
2591 uint8_t halfH[SIZE*SIZE];\ | |
2592 uint8_t halfHV[SIZE*SIZE];\ | |
2593 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2594 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2595 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2596 }\ | |
2597 \ | |
2598 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2599 int16_t tmp[SIZE*(SIZE+5)];\ | |
2600 uint8_t halfH[SIZE*SIZE];\ | |
2601 uint8_t halfHV[SIZE*SIZE];\ | |
2602 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2603 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2604 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2605 }\ | |
2606 \ | |
2607 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2608 uint8_t full[SIZE*(SIZE+5)];\ | |
2609 uint8_t * const full_mid= full + SIZE*2;\ | |
2610 int16_t tmp[SIZE*(SIZE+5)];\ | |
2611 uint8_t halfV[SIZE*SIZE];\ | |
2612 uint8_t halfHV[SIZE*SIZE];\ | |
2613 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2614 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2615 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2616 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2617 }\ | |
2618 \ | |
2619 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2620 uint8_t full[SIZE*(SIZE+5)];\ | |
2621 uint8_t * const full_mid= full + SIZE*2;\ | |
2622 int16_t tmp[SIZE*(SIZE+5)];\ | |
2623 uint8_t halfV[SIZE*SIZE];\ | |
2624 uint8_t halfHV[SIZE*SIZE];\ | |
2625 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2626 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2627 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2628 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2629 }\ | |
2630 | |
2631 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) | |
2632 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7) | |
2633 #define op_put(a, b) a = cm[((b) + 16)>>5] | |
2634 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1) | |
2635 #define op2_put(a, b) a = cm[((b) + 512)>>10] | |
2636 | |
2637 H264_LOWPASS(put_ , op_put, op2_put) | |
2638 H264_LOWPASS(avg_ , op_avg, op2_avg) | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2639 H264_MC(put_, 2) |
1168 | 2640 H264_MC(put_, 4) |
2641 H264_MC(put_, 8) | |
2642 H264_MC(put_, 16) | |
2643 H264_MC(avg_, 4) | |
2644 H264_MC(avg_, 8) | |
2645 H264_MC(avg_, 16) | |
2646 | |
2647 #undef op_avg | |
2648 #undef op_put | |
2649 #undef op2_avg | |
2650 #undef op2_put | |
2651 #endif | |
2652 | |
/* H.264 weighted prediction, C reference versions.
 * op_scale1 applies unidirectional weighting in place on `block`;
 * op_scale2 combines `src` and `dst` with two weights (bidirectional).
 * Results are clipped to [0,255] by av_clip_uint8. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    /* add rounding term when a right shift will happen */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* force offset odd before scaling (rounding as in the original) */ \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2722 | |
936 | 2723 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ |
4176 | 2724 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 2725 int i; |
2726 | |
2727 for(i=0; i<h; i++){ | |
2728 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4]; | |
2729 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4]; | |
2730 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4]; | |
2731 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4]; | |
2732 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4]; | |
2733 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4]; | |
2734 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4]; | |
2735 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4]; | |
2736 dst+=dstStride; | |
2967 | 2737 src+=srcStride; |
936 | 2738 } |
2739 } | |
2740 | |
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Exported full-pel MC wrappers used by the CAVS decoder: the mc00
 * (integer-pel) positions are plain block copies / averages. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2758 |
9586
c7420bfe4da0
Don't #if a function declaration and properly indent it.
ramiro
parents:
9585
diff
changeset
|
2759 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx); |
9585 | 2760 |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
2761 #if CONFIG_VC1_DECODER |
3526 | 2762 /* VC-1 specific */ |
2763 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx); | |
2764 | |
2765 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) { | |
2766 put_pixels8_c(dst, src, stride, 8); | |
2767 } | |
9437 | 2768 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) { |
2769 avg_pixels8_c(dst, src, stride, 8); | |
2770 } | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
2771 #endif /* CONFIG_VC1_DECODER */ |
3526 | 2772 |
5887 | 2773 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx); |
5899 | 2774 |
4296 | 2775 /* H264 specific */ |
5411
362aec4ef932
Take care of some renames (Doxygen and function name) after the previous pure rename patch.
takis
parents:
5394
diff
changeset
|
2776 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx); |
4296 | 2777 |
8590 | 2778 #if CONFIG_RV30_DECODER |
8410 | 2779 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx); |
2780 #endif /* CONFIG_RV30_DECODER */ | |
2781 | |
8590 | 2782 #if CONFIG_RV40_DECODER |
8232 | 2783 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ |
2784 put_pixels16_xy2_c(dst, src, stride, 16); | |
2785 } | |
2786 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ | |
2787 avg_pixels16_xy2_c(dst, src, stride, 16); | |
2788 } | |
2789 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ | |
2790 put_pixels8_xy2_c(dst, src, stride, 8); | |
2791 } | |
2792 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ | |
2793 avg_pixels8_xy2_c(dst, src, stride, 8); | |
2794 } | |
2795 | |
2796 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx); | |
2797 #endif /* CONFIG_RV40_DECODER */ | |
2798 | |
936 | 2799 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){ |
4176 | 2800 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 2801 int i; |
2802 | |
2803 for(i=0; i<w; i++){ | |
2804 const int src_1= src[ -srcStride]; | |
2805 const int src0 = src[0 ]; | |
2806 const int src1 = src[ srcStride]; | |
2807 const int src2 = src[2*srcStride]; | |
2808 const int src3 = src[3*srcStride]; | |
2809 const int src4 = src[4*srcStride]; | |
2810 const int src5 = src[5*srcStride]; | |
2811 const int src6 = src[6*srcStride]; | |
2812 const int src7 = src[7*srcStride]; | |
2813 const int src8 = src[8*srcStride]; | |
2814 const int src9 = src[9*srcStride]; | |
2815 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; | |
2816 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; | |
2817 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; | |
2818 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; | |
2819 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; | |
2820 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; | |
2821 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; | |
2822 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; | |
2823 src++; | |
2824 dst++; | |
2825 } | |
2826 } | |
2827 | |
/* WMV2 mspel motion-compensation wrappers: one function per sub-pel position
 * (mcXY = horizontal position X, vertical position Y). */

/* (0,0): plain pixel copy. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* (1,0): average of source and horizontally filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* (2,0): horizontally filtered block. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* (3,0): average of right-shifted source and horizontally filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, half, stride, stride, 8, 8);
}

/* (0,2): vertically filtered block. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* (1,2): average of vertically filtered source and H+V filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: one extra row above, two below, for the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

/* (3,2): as mc12 but the vertical-only half starts one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

/* (2,2): horizontal pass followed by vertical pass. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
2875 | |
1644 | 2876 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){ |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
2877 if(CONFIG_ANY_H263) { |
1644 | 2878 int x; |
2879 const int strength= ff_h263_loop_filter_strength[qscale]; | |
2967 | 2880 |
1644 | 2881 for(x=0; x<8; x++){ |
2882 int d1, d2, ad1; | |
2883 int p0= src[x-2*stride]; | |
2884 int p1= src[x-1*stride]; | |
2885 int p2= src[x+0*stride]; | |
2886 int p3= src[x+1*stride]; | |
2887 int d = (p0 - p3 + 4*(p2 - p1)) / 8; | |
2888 | |
2889 if (d<-2*strength) d1= 0; | |
2890 else if(d<- strength) d1=-2*strength - d; | |
2891 else if(d< strength) d1= d; | |
2892 else if(d< 2*strength) d1= 2*strength - d; | |
2893 else d1= 0; | |
2967 | 2894 |
1644 | 2895 p1 += d1; |
2896 p2 -= d1; | |
2897 if(p1&256) p1= ~(p1>>31); | |
2898 if(p2&256) p2= ~(p2>>31); | |
2967 | 2899 |
1644 | 2900 src[x-1*stride] = p1; |
2901 src[x+0*stride] = p2; | |
2902 | |
4001 | 2903 ad1= FFABS(d1)>>1; |
2967 | 2904 |
4594 | 2905 d2= av_clip((p0-p3)/4, -ad1, ad1); |
2967 | 2906 |
1644 | 2907 src[x-2*stride] = p0 - d2; |
2908 src[x+ stride] = p3 + d2; | |
2909 } | |
5394
e9a6215f4e3a
help some gcc version to optimize out those functions
aurel
parents:
5291
diff
changeset
|
2910 } |
1644 | 2911 } |
2912 | |
2913 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){ | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
2914 if(CONFIG_ANY_H263) { |
1644 | 2915 int y; |
2916 const int strength= ff_h263_loop_filter_strength[qscale]; | |
2967 | 2917 |
1644 | 2918 for(y=0; y<8; y++){ |
2919 int d1, d2, ad1; | |
2920 int p0= src[y*stride-2]; | |
2921 int p1= src[y*stride-1]; | |
2922 int p2= src[y*stride+0]; | |
2923 int p3= src[y*stride+1]; | |
2924 int d = (p0 - p3 + 4*(p2 - p1)) / 8; | |
2925 | |
2926 if (d<-2*strength) d1= 0; | |
2927 else if(d<- strength) d1=-2*strength - d; | |
2928 else if(d< strength) d1= d; | |
2929 else if(d< 2*strength) d1= 2*strength - d; | |
2930 else d1= 0; | |
2967 | 2931 |
1644 | 2932 p1 += d1; |
2933 p2 -= d1; | |
2934 if(p1&256) p1= ~(p1>>31); | |
2935 if(p2&256) p2= ~(p2>>31); | |
2967 | 2936 |
1644 | 2937 src[y*stride-1] = p1; |
2938 src[y*stride+0] = p2; | |
2939 | |
4001 | 2940 ad1= FFABS(d1)>>1; |
2967 | 2941 |
4594 | 2942 d2= av_clip((p0-p3)/4, -ad1, ad1); |
2967 | 2943 |
1644 | 2944 src[y*stride-2] = p0 - d2; |
2945 src[y*stride+1] = p3 + d2; | |
2946 } | |
5394
e9a6215f4e3a
help some gcc version to optimize out those functions
aurel
parents:
5291
diff
changeset
|
2947 } |
1644 | 2948 } |
936 | 2949 |
/**
 * H.261 in-place loop filter on an 8x8 block: separable [1 2 1] smoothing,
 * with border rows/columns passed through unfiltered (scaled to keep the
 * same fixed-point precision as the filtered interior).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int tmp[64];
    int row, col;

    /* vertical [1 2 1] pass into tmp; top/bottom rows are 4*src (pass-through) */
    for (col = 0; col < 8; col++) {
        tmp[col]      = 4 * src[col];
        tmp[col + 56] = 4 * src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int p = row * stride + col;
            tmp[row*8 + col] = src[p - stride] + 2*src[p] + src[p + stride];
        }
    }

    /* horizontal [1 2 1] pass back into src; left/right columns pass through */
    for (row = 0; row < 8; row++) {
        src[row*stride    ] = (tmp[row*8    ] + 2) >> 2;
        src[row*stride + 7] = (tmp[row*8 + 7] + 2) >> 2;
        for (col = 1; col < 7; col++) {
            const int t = row*8 + col;
            src[row*stride + col] = (tmp[t-1] + 2*tmp[t] + tmp[t+1] + 8) >> 4;
        }
    }
}
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2976 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/**
 * H.264 normal (bS<4) luma deblocking over a 16-sample edge, processed as
 * four 4-sample groups. tc0[i] < 0 disables filtering for group i.
 * xstride steps across the edge, ystride steps along it.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int grp, row;

    for (grp = 0; grp < 4; grp++) {
        if (tc0[grp] < 0) {
            pix += 4*ystride;   /* group disabled: skip its 4 lines */
            continue;
        }
        for (row = 0; row < 4; row++, pix += ystride) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];

            if (FFABS(p0 - q0) < alpha &&
                FFABS(p1 - p0) < beta  &&
                FFABS(q1 - q0) < beta) {
                int tc = tc0[grp];
                int delta;

                /* optionally filter p1/q1; each use widens the clip range for p0/q0 */
                if (FFABS(p2 - p0) < beta) {
                    pix[-2*xstride] = p1 + av_clip(((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[grp], tc0[grp]);
                    tc++;
                }
                if (FFABS(q2 - q0) < beta) {
                    pix[xstride] = q1 + av_clip(((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[grp], tc0[grp]);
                    tc++;
                }

                delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
                pix[-xstride] = av_clip_uint8(p0 + delta);   /* p0' */
                pix[ 0]       = av_clip_uint8(q0 - delta);   /* q0' */
            }
        }
    }
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Vertical filtering direction (horizontal edge): step across = stride. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}

/* Horizontal filtering direction (vertical edge): step across = 1. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
3025 | |
8395
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/**
 * H.264 strong (bS==4, intra) luma deblocking over a 16-sample edge.
 * Chooses between the strong 3-tap-per-side filter and the light filter
 * depending on |p0-q0| versus (alpha>>2)+2 and the p2/q2 beta checks.
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int row;

    for (row = 0; row < 16; row++, pix += ystride) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta  ||
            FFABS(q1 - q0) >= beta)
            continue;

        if (FFABS(p0 - q0) < ((alpha >> 2) + 2)) {
            /* strong filtering on the p side if p2 is close enough to p0 */
            if (FFABS(p2 - p0) < beta) {
                const int p3 = pix[-4*xstride];
                /* p0', p1', p2' */
                pix[-1*xstride] = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
                pix[-2*xstride] = (p2 + p1 + p0 + q0 + 2) >> 2;
                pix[-3*xstride] = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
            } else {
                /* p0' only */
                pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
            }
            /* strong filtering on the q side if q2 is close enough to q0 */
            if (FFABS(q2 - q0) < beta) {
                const int q3 = pix[3*xstride];
                /* q0', q1', q2' */
                pix[0*xstride] = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3;
                pix[1*xstride] = (p0 + q0 + q1 + q2 + 2) >> 2;
                pix[2*xstride] = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3;
            } else {
                /* q0' only */
                pix[0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
            }
        } else {
            /* light filtering: p0' and q0' only */
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
        }
    }
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/* Vertical filtering direction (horizontal edge), intra/strong variant. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}

/* Horizontal filtering direction (vertical edge), intra/strong variant. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3082 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/**
 * H.264 normal (bS<4) chroma deblocking over an 8-sample edge, processed as
 * four 2-sample groups. tc0[i] <= 0 disables filtering for group i.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int grp, row;

    for (grp = 0; grp < 4; grp++) {
        const int tc = tc0[grp];
        if (tc <= 0) {
            pix += 2*ystride;   /* group disabled: skip its 2 lines */
            continue;
        }
        for (row = 0; row < 2; row++, pix += ystride) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0];
            const int q1 = pix[ 1*xstride];

            if (FFABS(p0 - q0) < alpha &&
                FFABS(p1 - p0) < beta  &&
                FFABS(q1 - q0) < beta) {
                const int delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);

                pix[-xstride] = av_clip_uint8(p0 + delta);   /* p0' */
                pix[ 0]       = av_clip_uint8(q0 - delta);   /* q0' */
            }
        }
    }
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Vertical filtering direction (horizontal edge), chroma. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}

/* Horizontal filtering direction (vertical edge), chroma. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
3119 | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/**
 * H.264 strong (bS==4, intra) chroma deblocking over an 8-sample edge:
 * only p0/q0 are rewritten, using the light 3-tap filter.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int row;

    for (row = 0; row < 8; row++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0];
        const int q1 = pix[ 1*xstride];

        if (FFABS(p0 - q0) < alpha &&
            FFABS(p1 - p0) < beta  &&
            FFABS(q1 - q0) < beta) {
            pix[-xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
            pix[ 0]       = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
        }
    }
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Vertical filtering direction (horizontal edge), chroma intra/strong variant. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}

/* Horizontal filtering direction (vertical edge), chroma intra/strong variant. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3147 |
/**
 * Sum of absolute differences over a 16-pixel-wide block of h rows
 * (C reference implementation; v is an unused context pointer).
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3175 | |
/**
 * SAD against the horizontal half-pel interpolation (average of each pixel
 * with its right neighbour) of pix2, 16 columns by h rows.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3203 | |
/**
 * SAD against the vertical half-pel interpolation (average of each pixel
 * with the pixel one row below) of pix2, 16 columns by h rows.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3233 | |
/**
 * SAD against the diagonal half-pel interpolation (4-pixel average) of pix2,
 * 16 columns by h rows.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3263 | |
/**
 * Sum of absolute differences over an 8-pixel-wide block of h rows
 * (C reference implementation; v is an unused context pointer).
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3283 | |
/**
 * SAD against the horizontal half-pel interpolation of pix2,
 * 8 columns by h rows.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3303 | |
/**
 * SAD against the vertical half-pel interpolation of pix2,
 * 8 columns by h rows.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3325 | |
/**
 * SAD against the diagonal half-pel interpolation (4-pixel average) of pix2,
 * 8 columns by h rows.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;   /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3347 | |
/**
 * Noise-preserving sum of squared differences, 16 pixels wide.
 * score1 is the plain SSE; score2 measures how much the horizontal+vertical
 * gradient structure of s1 and s2 differs, so added noise that preserves
 * texture is penalized less than blurring.
 * @param v MpegEncContext (may be NULL; then a fixed weight of 8 is used)
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        if(y+1<h){ /* gradient term needs the next row; skip on last row */
            for(x=0; x<15; x++){
                /* 2x2 second-difference of s1 minus the same of s2 */
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3373 |
/**
 * Noise-preserving sum of squared differences, 8 pixels wide.
 * Same formula as nsse16_c but over an 8-wide block (gradient loop uses
 * 7 columns since it needs x and x+1).
 * @param v MpegEncContext (may be NULL; then a fixed weight of 8 is used)
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        if(y+1<h){ /* gradient term needs the next row; skip on last row */
            for(x=0; x<7; x++){
                /* 2x2 second-difference of s1 minus the same of s2 */
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3399 |
/**
 * Evaluate the weighted squared error that would remain if 'basis' scaled by
 * 'scale' were added to the residual 'rem' (used by the trellis/basis search
 * in the encoder). Does not modify rem.
 * @return weighted sum of squares, right-shifted by 2
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* scale the basis function and add it to the residual, with
         * rounding at the BASIS_SHIFT -> RECON_SHIFT precision change */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
3414 | |
/**
 * Add 'basis' scaled by 'scale' to the residual 'rem' in place, using the
 * same rounding as try_8x8basis_c (commits the change that function scored).
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
3422 | |
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    /* Gather every coefficient that can be non-zero (scan positions
     * 0..last) into temp, clearing it in the source block. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* Scatter them back to their permuted positions; positions not written
     * here are zero already, either originally or from the loop above. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
34 | 3451 |
/* Dummy compare used for FF_CMP_ZERO: every candidate scores 0, so motion
 * search degenerates to always picking the first/zero vector. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s; (void)a; (void)b; (void)stride; (void)h;
    return 0;
}
3455 | |
/**
 * Fill the 6-entry comparison-function table 'cmp' from the DSPContext
 * according to the FF_CMP_* id in the low byte of 'type'.
 * Entries with no implementation for the selected metric stay NULL.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){ /* only the low byte selects the metric */
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        /* wavelet metrics are only available with the snow encoder built in */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3515 | |
/** Zero a single 64-coefficient DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
3520 | |
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zero all 6 DCT blocks of one macroblock (4 luma + 2 chroma).
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3528 | |
866 | 3529 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ |
6385 | 3530 long i; |
3531 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3532 long a = *(long*)(src+i); | |
3533 long b = *(long*)(dst+i); | |
3534 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); | |
866 | 3535 } |
3536 for(; i<w; i++) | |
3537 dst[i+0] += src[i+0]; | |
3538 } | |
3539 | |
6384 | 3540 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
6385 | 3541 long i; |
6384 | 3542 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ |
3543 long a = *(long*)(src1+i); | |
3544 long b = *(long*)(src2+i); | |
6385 | 3545 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); |
6384 | 3546 } |
3547 for(; i<w; i++) | |
3548 dst[i] = src1[i]+src2[i]; | |
3549 } | |
3550 | |
866 | 3551 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
6385 | 3552 long i; |
8590 | 3553 #if !HAVE_FAST_UNALIGNED |
6385 | 3554 if((long)src2 & (sizeof(long)-1)){ |
6386 | 3555 for(i=0; i+7<w; i+=8){ |
3556 dst[i+0] = src1[i+0]-src2[i+0]; | |
3557 dst[i+1] = src1[i+1]-src2[i+1]; | |
3558 dst[i+2] = src1[i+2]-src2[i+2]; | |
3559 dst[i+3] = src1[i+3]-src2[i+3]; | |
3560 dst[i+4] = src1[i+4]-src2[i+4]; | |
3561 dst[i+5] = src1[i+5]-src2[i+5]; | |
3562 dst[i+6] = src1[i+6]-src2[i+6]; | |
3563 dst[i+7] = src1[i+7]-src2[i+7]; | |
3564 } | |
6385 | 3565 }else |
3566 #endif | |
3567 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3568 long a = *(long*)(src1+i); | |
3569 long b = *(long*)(src2+i); | |
3570 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80); | |
3571 } | |
866 | 3572 for(; i<w; i++) |
3573 dst[i+0] = src1[i+0]-src2[i+0]; | |
3574 } | |
3575 | |
/**
 * HuffYUV median-prediction decode: reconstruct dst by adding the residual
 * 'diff' to the median of (left, top, left+top-topleft), where src1 is the
 * previous (top) row. *left / *left_top carry predictor state across calls
 * and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* median predictor + residual; &0xFF keeps the gradient term in byte range */
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
3592 | |
/**
 * HuffYUV median-prediction encode: dst = src2 - median(left, top,
 * left+top-topleft), the inverse of add_hfyu_median_prediction_c.
 * src1 is the previous (top) row, src2 the current row; *left / *left_top
 * carry predictor state across calls and are updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred; /* residual; wraps mod 256 by uint8_t assignment */
    }

    *left= l;
    *left_top= lt;
}
3610 | |
/* 2-point butterfly writing sum/difference to two separate outputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place 2-point butterfly: x,y <- x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: absolute sum of the final butterfly stage. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
936 | 3625 |
/**
 * SATD: 8x8 Hadamard transform of the difference src-dst, then the sum of
 * absolute transform coefficients. Rows are transformed in the first loop,
 * columns (with the final stage folded into BUTTERFLYA) in the second.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* stage 1: butterflies on pixel differences of row i */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* stage 2: column transform; last butterfly stage is done inside
         * BUTTERFLYA together with the absolute-value accumulation */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3677 | |
/**
 * Intra SATD: 8x8 Hadamard transform of the source pixels themselves
 * (dummy is unused), sum of absolute coefficients minus the DC term,
 * so a flat block costs ~0.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* stage 1: row transform on raw pixels */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* stage 2: column transform; last stage folded into BUTTERFLYA */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3725 | |
/**
 * DCT-based comparison: forward-DCT the 8x8 pixel difference and return the
 * sum of absolute transform coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing array to get 16-byte alignment for the DCT */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3737 | |
#if CONFIG_GPL
/* One pass of the H.264/x264 8x8 integer DCT over the 8 samples accessed
 * through SRC()/DST(); the caller #defines those to select row or column
 * orientation. GPL-only because it is derived from x264. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Comparison metric using the H.264 8x8 integer DCT: transform the pixel
 * difference (rows in place, then columns accumulating |coeff| via DST).
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3790 |
/**
 * DCT-based comparison returning the largest absolute transform coefficient
 * of the 8x8 pixel difference (peak error in the frequency domain).
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing array to get 16-byte alignment for the DCT */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3807 | |
/**
 * Comparison metric: SSE between the DCT coefficients before and after a
 * quantize/dequantize/IDCT round trip, i.e. the distortion the current
 * quantizer would introduce on this block.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* double-size buffer: first 64 elements transform, last 64 backup */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0; /* always measured as an inter block */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM)); /* keep pre-quantization copy */

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    /* NOTE(review): bak holds pixel-difference values while temp has been
     * through the IDCT; the comparison is still monotone in distortion,
     * which is all the metric needs. */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3831 | |
/**
 * Rate-distortion comparison: quantize the 8x8 difference, count the bits
 * its VLC coding would take, reconstruct, and return
 * distortion + lambda-weighted rate.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on aligned local copies so idct_add can reconstruct in place */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1; /* DC coded separately for intra */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits; levels outside -64..63 use escape coding */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias into table index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct into the local copy and measure the distortion */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* rate weighted by qscale^2 (lambda); 109/128 is the tuning factor */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3910 | |
/**
 * Rate-only comparison: quantize the 8x8 difference and return the number
 * of bits its run/level VLC coding would take (no distortion term).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1; /* DC coded separately for intra */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits; levels outside -64..63 use escape coding */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias into table index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3970 | |
/* Intra vertical SAD: sum of absolute differences between vertically
 * adjacent pixels inside a single block of width 'size' (8 or 16),
 * 'h' rows, pitch 'stride'.  The second picture argument is unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
\
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
\
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
1729 | 3988 |
/* Vertical SAD between two pictures: sums |(s1-s2) - (s1-s2 one row down)|
 * over a 16-pixel-wide block with h rows. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int y, x;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            score += d >= 0 ? d : -d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
4003 | |
#define SQ(a) ((a)*(a))
/* Intra vertical SSE: like VSAD_INTRA but sums squared differences of
 * vertically adjacent pixels within one block of width 'size'. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
\
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
\
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
1729 | 4022 |
/* Vertical SSE between two pictures: squared version of vsad16_c. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int y, x;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            score += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
4037 | |
/* Sum of squared differences between an int8 vector and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int n, acc = 0;
    for (n = 0; n < size; n++) {
        int d = pix1[n] - pix2[n];
        acc += d * d;
    }
    return acc;
}
4046 | |
/* Instantiate 16x16 comparison functions from their 8x8 kernels: each
 * WRAPPER8_16_SQ(a, b) defines b as the sum of a over the four 8x8
 * quadrants of a 16x16 block. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
936 | 4057 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/* In-place element-wise multiply: dst[i] *= src[i] for i in [0,len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = dst[n] * src[n];
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4063 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4070 |
10300
4d1b9ca628fc
Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents:
10219
diff
changeset
|
/* Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n;
    for (n = 0; n < len; n++) {
        float prod = src0[n] * src1[n];
        dst[n] = prod + src2[n];
    }
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4076 |
/* Overlap-add windowing: produces 2*len output samples in dst from len
 * samples each of src0 and src1 using the 2*len-coefficient window win,
 * adding add_bias to every output.  Outputs are mirror-paired around the
 * block centre. */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int t;
    for (t = 0; t < len; t++) {
        int m = 2*len - 1 - t;       /* mirrored index */
        float a  = src0[t];
        float b  = src1[len - 1 - t];
        float w0 = win[t];
        float w1 = win[m];
        dst[t] = a*w1 - b*w0 + add_bias;
        dst[m] = a*w0 + b*w1 + add_bias;
    }
}
4091 | |
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n;
    for (n = 0; n < len; n++)
        dst[n] = mul * src[n];
}
4099 | |
/* Multiply src by a sequence of 2-element subvectors and a scalar:
 * each consecutive pair of outputs uses the next entry of sv. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n, pair;
    for (n = 0, pair = 0; n < len; n += 2, pair++) {
        const float *s = sv[pair];
        dst[n    ] = src[n    ] * s[0] * mul;
        dst[n + 1] = src[n + 1] * s[1] * mul;
    }
}
4109 | |
/* Multiply src by a sequence of 4-element subvectors and a scalar:
 * each consecutive quadruple of outputs uses the next entry of sv. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n, quad;
    for (n = 0, quad = 0; n < len; n += 4, quad++) {
        const float *s = sv[quad];
        dst[n    ] = src[n    ] * s[0] * mul;
        dst[n + 1] = src[n + 1] * s[1] * mul;
        dst[n + 2] = src[n + 2] * s[2] * mul;
        dst[n + 3] = src[n + 3] * s[3] * mul;
    }
}
4121 | |
/* Expand a sequence of 2-element subvectors scaled by mul into dst. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;
    for (n = 0; n < len; n += 2) {
        const float *s = *sv++;
        dst[n    ] = s[0] * mul;
        dst[n + 1] = s[1] * mul;
    }
}
4131 | |
/* Expand a sequence of 4-element subvectors scaled by mul into dst. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;
    for (n = 0; n < len; n += 4) {
        const float *s = *sv++;
        dst[n    ] = s[0] * mul;
        dst[n + 1] = s[1] * mul;
        dst[n + 2] = s[2] * mul;
        dst[n + 3] = s[3] * mul;
    }
}
4143 | |
/* Butterfly transform: (v1, v2) <- (v1 + v2, v1 - v2), element-wise. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;
    for (n = 0; n < len; n++) {
        float sum  = v1[n] + v2[n];
        float diff = v1[n] - v2[n];
        v1[n] = sum;
        v2[n] = diff;
    }
}
4154 | |
/* Dot product of two float vectors, accumulated in order. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int n;

    for (n = 0; n < len; n++)
        acc += v1[n] * v2[n];

    return acc;
}
4165 | |
/* Convert int samples to float while scaling: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = mul * src[n];
}
4171 | |
10104
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
/* Clamp one float, given as its raw IEEE-754 bit pattern 'a', using
 * precomputed bounds (see vector_clipf_c_opposite_sign):
 *   mini     = bits of the negative lower bound,
 *   maxi     = bits of the positive upper bound,
 *   maxisign = maxi with the sign bit flipped.
 * Valid only when the clip range straddles zero.
 * Fix: use 1U<<31 — left-shifting 1 (a signed int) into the sign bit is
 * undefined behavior in C. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                        uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;                     /* negative, below the lower bound */
    else if((a^(1U<<31)) > maxisign) return maxi; /* positive, above the upper bound */
    else return a;
}
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4180 |
/* Clip floats to [*min, *max] where *min < 0 < *max, comparing raw
 * IEEE-754 bit patterns instead of doing float compares.  len must be a
 * multiple of 8 (loop unrolled by 8).
 * NOTE(review): reads floats through uint32_t pointers, which violates
 * strict aliasing; kept as-is since callers and SIMD versions rely on this
 * layout — confirm the build uses compatible aliasing settings.
 * Fix: 1U<<31 — shifting 1 (signed int) into the sign bit is UB in C. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/* Clip every float in src to [min, max] and store to dst.
 * NOTE(review): len appears to be assumed a multiple of 8 (both paths are
 * unrolled by 8) — confirm against callers before reuse. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        /* range straddles zero: use the integer-compare fast path */
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4216 |
/* Convert one float to a 16-bit sample by reinterpreting its bits.
 * NOTE(review): relies on the caller having scaled/biased the input so the
 * sample value sits in the low bits of the float representation (the
 * 0x43c0ffff constant suggests a bias near 0x43c00000, i.e. 384.0f) —
 * confirm against ff_float_to_int16_* callers before reuse.
 * The `tmp & 0xf0000` test detects out-of-range bit patterns, which are
 * then saturated by the shift expression below. */
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;  /* type-pun: raw float bits */
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
        // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
        // else                 tmp = 0;
    }
    return tmp - 0x8000;
}
4227 | |
/* Scalar fallback: convert len float samples to int16, one at a time. */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = float_to_int16_one(&src[n]);
}
4233 | |
7286
e267f2519248
float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous.
lorenm
parents:
7263
diff
changeset
|
/* Interleave 'channels' planar float streams into one int16 stream:
 * src[c] is the plane for channel c, each len samples long; dst receives
 * len*channels interleaved samples.  Stereo is special-cased. */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
7203
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
/* Element-wise addition: v1[i] += v2[i] for i in [0, order). */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int n;
    for (n = 0; n < order; n++)
        v1[n] += v2[n];
}
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4253 |
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
/* Element-wise subtraction: v1[i] -= v2[i] for i in [0, order). */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int n;
    for (n = 0; n < order; n++)
        v1[n] -= v2[n];
}
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4259 |
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
/* Dot product of two int16 vectors; each product is right-shifted by
 * 'shift' BEFORE being accumulated (per-term scaling, matching the SIMD
 * implementations). */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int32_t acc = 0;
    int n;

    for (n = 0; n < order; n++)
        acc += (v1[n] * v2[n]) >> shift;

    return acc;
}
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4269 |
/* Fixed-point IDCT constants: Wk ~= 2048*sqrt(2)*cos(k*pi/16). */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* One 8-point fixed-point IDCT over a row of coefficients, in place. */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: butterfly recombination with rounding, >>8 descale*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/* One 8-point fixed-point IDCT down a column (stride 8), in place; the
 * intermediate >>3 and final >>14 scaling preserve extra precision. */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/* Full 8x8 WMV2 IDCT: rows first, then columns, in place. */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);
    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
1092 | 4342 /* XXX: those functions should be suppressed ASAP when all IDCTs are |
4343 converted */ | |
/* Glue binding the WMV2 IDCT to the DSPContext idct_put/idct_add API:
 * transform the block, then store (put) or accumulate (add) the clamped
 * pixels into dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* Wrappers binding the reference integer IDCT (j_rev_dct) to the
 * DSPContext idct_put/idct_add interface. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4364 | |
/* Reduced-size reference IDCT wrappers used for low-resolution decoding:
 * 4x4 (lowres==1) and 2x2 (lowres==2) variants. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4386 | |
/* 1x1 "IDCT" for lowres==3: only the DC coefficient survives; it is
 * descaled by >>3 (with rounding) and clamped to [0,255] via ff_cropTbl. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;  /* clamp table */

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4399 | |
/* Do-nothing placeholder matching the prefetch-style signature; all
 * arguments are ignored. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4401 |
/* init static data: fill the global lookup tables shared by all codecs. */
void dsputil_static_init(void)
{
    int i;

    /* ff_cropTbl: clamp table — identity on [0,255], saturating to 0/255
     * for MAX_NEG_CROP entries on either side. */
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* ff_squareTbl[i] = (i-256)^2, for fast squared-difference lookups */
    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan, stored 1-based (value i+1 at scan position) */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
0 | 4419 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
/* Verify that the compiler honors 16-byte stack alignment (required by the
 * SSE/AltiVec code paths).  Returns 0 if aligned, -1 otherwise; the error
 * message is emitted only once per process. */
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
861 | 4439 |
1201 | 4440 void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
4441 { | |
4442 int i; | |
0 | 4443 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4444 ff_check_alignment(); |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4445 |
8590 | 4446 #if CONFIG_ENCODERS |
1567 | 4447 if(avctx->dct_algo==FF_DCT_FASTINT) { |
1092 | 4448 c->fdct = fdct_ifast; |
2979 | 4449 c->fdct248 = fdct_ifast248; |
2967 | 4450 } |
1567 | 4451 else if(avctx->dct_algo==FF_DCT_FAAN) { |
1557 | 4452 c->fdct = ff_faandct; |
2979 | 4453 c->fdct248 = ff_faandct248; |
2967 | 4454 } |
1567 | 4455 else { |
1092 | 4456 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default |
2979 | 4457 c->fdct248 = ff_fdct248_islow; |
1567 | 4458 } |
1092 | 4459 #endif //CONFIG_ENCODERS |
4460 | |
2256 | 4461 if(avctx->lowres==1){ |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4462 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){ |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4463 c->idct_put= ff_jref_idct4_put; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4464 c->idct_add= ff_jref_idct4_add; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4465 }else{ |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4466 c->idct_put= ff_h264_lowres_idct_put_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4467 c->idct_add= ff_h264_lowres_idct_add_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4468 } |
2256 | 4469 c->idct = j_rev_dct4; |
1092 | 4470 c->idct_permutation_type= FF_NO_IDCT_PERM; |
2257 | 4471 }else if(avctx->lowres==2){ |
4472 c->idct_put= ff_jref_idct2_put; | |
4473 c->idct_add= ff_jref_idct2_add; | |
4474 c->idct = j_rev_dct2; | |
4475 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2259 | 4476 }else if(avctx->lowres==3){ |
4477 c->idct_put= ff_jref_idct1_put; | |
4478 c->idct_add= ff_jref_idct1_add; | |
4479 c->idct = j_rev_dct1; | |
4480 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 4481 }else{ |
4482 if(avctx->idct_algo==FF_IDCT_INT){ | |
4483 c->idct_put= ff_jref_idct_put; | |
4484 c->idct_add= ff_jref_idct_add; | |
4485 c->idct = j_rev_dct; | |
4486 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
9975
d6d7e8d4a04d
Do not redundantly check for both CONFIG_THEORA_DECODER and CONFIG_VP3_DECODER.
diego
parents:
9586
diff
changeset
|
4487 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) && |
5007 | 4488 avctx->idct_algo==FF_IDCT_VP3){ |
2693 | 4489 c->idct_put= ff_vp3_idct_put_c; |
4490 c->idct_add= ff_vp3_idct_add_c; | |
4491 c->idct = ff_vp3_idct_c; | |
4492 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
5887 | 4493 }else if(avctx->idct_algo==FF_IDCT_WMV2){ |
4494 c->idct_put= ff_wmv2_idct_put_c; | |
4495 c->idct_add= ff_wmv2_idct_add_c; | |
4496 c->idct = ff_wmv2_idct_c; | |
4497 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
6407 | 4498 }else if(avctx->idct_algo==FF_IDCT_FAAN){ |
4499 c->idct_put= ff_faanidct_put; | |
4500 c->idct_add= ff_faanidct_add; | |
4501 c->idct = ff_faanidct; | |
4502 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4503 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) { |
8120 | 4504 c->idct_put= ff_ea_idct_put_c; |
4505 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 4506 }else{ //accurate/default |
6001 | 4507 c->idct_put= ff_simple_idct_put; |
4508 c->idct_add= ff_simple_idct_add; | |
4509 c->idct = ff_simple_idct; | |
2256 | 4510 c->idct_permutation_type= FF_NO_IDCT_PERM; |
4511 } | |
1092 | 4512 } |
4513 | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4514 if (CONFIG_H264_DECODER) { |
5065 | 4515 c->h264_idct_add= ff_h264_idct_add_c; |
4516 c->h264_idct8_add= ff_h264_idct8_add_c; | |
4517 c->h264_idct_dc_add= ff_h264_idct_dc_add_c; | |
4518 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; | |
8375
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4519 c->h264_idct_add16 = ff_h264_idct_add16_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4520 c->h264_idct8_add4 = ff_h264_idct8_add4_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4521 c->h264_idct_add8 = ff_h264_idct_add8_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4522 c->h264_idct_add16intra= ff_h264_idct_add16intra_c; |
5064 | 4523 } |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4524 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4525 c->get_pixels = get_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4526 c->diff_pixels = diff_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4527 c->put_pixels_clamped = put_pixels_clamped_c; |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
4528 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4529 c->add_pixels_clamped = add_pixels_clamped_c; |
2763 | 4530 c->add_pixels8 = add_pixels8_c; |
4531 c->add_pixels4 = add_pixels4_c; | |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
4532 c->sum_abs_dctelem = sum_abs_dctelem_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4533 c->gmc1 = gmc1_c; |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
4534 c->gmc = ff_gmc_c; |
8288 | 4535 c->clear_block = clear_block_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4536 c->clear_blocks = clear_blocks_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4537 c->pix_sum = pix_sum_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4538 c->pix_norm1 = pix_norm1_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4539 |
859 | 4540 /* TODO [0] 16 [1] 8 */ |
1708 | 4541 c->pix_abs[0][0] = pix_abs16_c; |
4542 c->pix_abs[0][1] = pix_abs16_x2_c; | |
4543 c->pix_abs[0][2] = pix_abs16_y2_c; | |
4544 c->pix_abs[0][3] = pix_abs16_xy2_c; | |
4545 c->pix_abs[1][0] = pix_abs8_c; | |
4546 c->pix_abs[1][1] = pix_abs8_x2_c; | |
4547 c->pix_abs[1][2] = pix_abs8_y2_c; | |
4548 c->pix_abs[1][3] = pix_abs8_xy2_c; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4549 |
859 | 4550 #define dspfunc(PFX, IDX, NUM) \ |
4551 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \ | |
4552 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \ | |
4553 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \ | |
4554 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4555 |
859 | 4556 dspfunc(put, 0, 16); |
4557 dspfunc(put_no_rnd, 0, 16); | |
4558 dspfunc(put, 1, 8); | |
4559 dspfunc(put_no_rnd, 1, 8); | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4560 dspfunc(put, 2, 4); |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4561 dspfunc(put, 3, 2); |
0 | 4562 |
859 | 4563 dspfunc(avg, 0, 16); |
4564 dspfunc(avg_no_rnd, 0, 16); | |
4565 dspfunc(avg, 1, 8); | |
4566 dspfunc(avg_no_rnd, 1, 8); | |
1319 | 4567 dspfunc(avg, 2, 4); |
4568 dspfunc(avg, 3, 2); | |
859 | 4569 #undef dspfunc |
857 | 4570 |
1864 | 4571 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c; |
4572 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c; | |
4573 | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4574 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4575 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4576 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4577 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4578 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4579 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4580 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4581 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4582 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4583 |
1319 | 4584 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c; |
4585 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c; | |
4586 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c; | |
4587 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c; | |
4588 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c; | |
4589 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c; | |
4590 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c; | |
4591 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c; | |
4592 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c; | |
4593 | |
859 | 4594 #define dspfunc(PFX, IDX, NUM) \ |
4595 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ | |
4596 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ | |
4597 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ | |
4598 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ | |
4599 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ | |
4600 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ | |
4601 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ | |
4602 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ | |
4603 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ | |
4604 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ | |
4605 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ | |
4606 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ | |
4607 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ | |
4608 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ | |
4609 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ | |
4610 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c | |
857 | 4611 |
859 | 4612 dspfunc(put_qpel, 0, 16); |
4613 dspfunc(put_no_rnd_qpel, 0, 16); | |
4614 | |
4615 dspfunc(avg_qpel, 0, 16); | |
4616 /* dspfunc(avg_no_rnd_qpel, 0, 16); */ | |
857 | 4617 |
859 | 4618 dspfunc(put_qpel, 1, 8); |
4619 dspfunc(put_no_rnd_qpel, 1, 8); | |
4620 | |
4621 dspfunc(avg_qpel, 1, 8); | |
4622 /* dspfunc(avg_no_rnd_qpel, 1, 8); */ | |
1168 | 4623 |
4624 dspfunc(put_h264_qpel, 0, 16); | |
4625 dspfunc(put_h264_qpel, 1, 8); | |
4626 dspfunc(put_h264_qpel, 2, 4); | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
4627 dspfunc(put_h264_qpel, 3, 2); |
1168 | 4628 dspfunc(avg_h264_qpel, 0, 16); |
4629 dspfunc(avg_h264_qpel, 1, 8); | |
4630 dspfunc(avg_h264_qpel, 2, 4); | |
4631 | |
859 | 4632 #undef dspfunc |
1168 | 4633 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; |
4634 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; | |
4635 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; | |
4636 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; | |
4637 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; | |
4638 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; | |
9439
ef3a7b711cc0
Rename put_no_rnd_h264_chroma* to reflect its usage in VC1 only
conrad
parents:
9437
diff
changeset
|
4639 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c; |
9440 | 4640 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c; |
857 | 4641 |
2415 | 4642 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; |
4643 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |
4644 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; | |
4645 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; | |
4646 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; | |
4647 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; | |
4648 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; | |
4649 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; | |
4650 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; | |
4651 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; | |
4652 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; | |
4653 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; | |
4654 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; | |
4655 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; | |
4656 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; | |
4657 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; | |
4658 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; | |
4659 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; | |
4660 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; | |
4661 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; | |
4662 | |
6437 | 4663 c->draw_edges = draw_edges_c; |
4664 | |
8590 | 4665 #if CONFIG_CAVS_DECODER |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4666 ff_cavsdsp_init(c,avctx); |
3432 | 4667 #endif |
9585 | 4668 |
4669 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER | |
4670 ff_mlp_init(c, avctx); | |
4671 #endif | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
4672 #if CONFIG_VC1_DECODER |
3526 | 4673 ff_vc1dsp_init(c,avctx); |
4674 #endif | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
4675 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER |
5887 | 4676 ff_intrax8dsp_init(c,avctx); |
4677 #endif | |
8590 | 4678 #if CONFIG_RV30_DECODER |
8410 | 4679 ff_rv30dsp_init(c,avctx); |
4680 #endif | |
8590 | 4681 #if CONFIG_RV40_DECODER |
8232 | 4682 ff_rv40dsp_init(c,avctx); |
4683 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c; | |
4684 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c; | |
4685 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c; | |
4686 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c; | |
4687 #endif | |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4688 |
936 | 4689 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c; |
4690 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c; | |
4691 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c; | |
4692 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c; | |
4693 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c; | |
4694 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c; | |
4695 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c; | |
4696 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c; | |
2967 | 4697 |
1708 | 4698 #define SET_CMP_FUNC(name) \ |
4699 c->name[0]= name ## 16_c;\ | |
4700 c->name[1]= name ## 8x8_c; | |
2967 | 4701 |
1708 | 4702 SET_CMP_FUNC(hadamard8_diff) |
1729 | 4703 c->hadamard8_diff[4]= hadamard8_intra16_c; |
8978 | 4704 c->hadamard8_diff[5]= hadamard8_intra8x8_c; |
1708 | 4705 SET_CMP_FUNC(dct_sad) |
2382 | 4706 SET_CMP_FUNC(dct_max) |
8590 | 4707 #if CONFIG_GPL |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
4708 SET_CMP_FUNC(dct264_sad) |
3013 | 4709 #endif |
1708 | 4710 c->sad[0]= pix_abs16_c; |
4711 c->sad[1]= pix_abs8_c; | |
4712 c->sse[0]= sse16_c; | |
4713 c->sse[1]= sse8_c; | |
2184 | 4714 c->sse[2]= sse4_c; |
1708 | 4715 SET_CMP_FUNC(quant_psnr) |
4716 SET_CMP_FUNC(rd) | |
4717 SET_CMP_FUNC(bit) | |
1729 | 4718 c->vsad[0]= vsad16_c; |
4719 c->vsad[4]= vsad_intra16_c; | |
8978 | 4720 c->vsad[5]= vsad_intra8_c; |
1729 | 4721 c->vsse[0]= vsse16_c; |
4722 c->vsse[4]= vsse_intra16_c; | |
8978 | 4723 c->vsse[5]= vsse_intra8_c; |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4724 c->nsse[0]= nsse16_c; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4725 c->nsse[1]= nsse8_c; |
8590 | 4726 #if CONFIG_SNOW_ENCODER |
2184 | 4727 c->w53[0]= w53_16_c; |
4728 c->w53[1]= w53_8_c; | |
4729 c->w97[0]= w97_16_c; | |
4730 c->w97[1]= w97_8_c; | |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
4731 #endif |
2184 | 4732 |
4749 | 4733 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c; |
4734 | |
866 | 4735 c->add_bytes= add_bytes_c; |
6384 | 4736 c->add_bytes_l2= add_bytes_l2_c; |
866 | 4737 c->diff_bytes= diff_bytes_c; |
8760 | 4738 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c; |
1527 | 4739 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c; |
1273 | 4740 c->bswap_buf= bswap_buf; |
8590 | 4741 #if CONFIG_PNG_DECODER |
6384 | 4742 c->add_png_paeth_prediction= ff_add_png_paeth_prediction; |
4743 #endif | |
2633 | 4744 |
4745 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; | |
4746 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; | |
8395
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
4747 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
4748 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; |
2633 | 4749 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; |
4750 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4751 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4752 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3568
diff
changeset
|
4753 c->h264_loop_filter_strength= NULL; |
2967 | 4754 |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4755 if (CONFIG_ANY_H263) { |
5278 | 4756 c->h263_h_loop_filter= h263_h_loop_filter_c; |
4757 c->h263_v_loop_filter= h263_v_loop_filter_c; | |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5256
diff
changeset
|
4758 } |
2967 | 4759 |
9975
d6d7e8d4a04d
Do not redundantly check for both CONFIG_THEORA_DECODER and CONFIG_VP3_DECODER.
diego
parents:
9586
diff
changeset
|
4760 if (CONFIG_VP3_DECODER) { |
7995 | 4761 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c; |
4762 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c; | |
4763 } | |
8785
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4764 if (CONFIG_VP6_DECODER) { |
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4765 c->vp6_filter_diag4= ff_vp6_filter_diag4_c; |
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4766 } |
7995 | 4767 |
2045 | 4768 c->h261_loop_filter= h261_loop_filter_c; |
2967 | 4769 |
1784 | 4770 c->try_8x8basis= try_8x8basis_c; |
4771 c->add_8x8basis= add_8x8basis_c; | |
866 | 4772 |
8590 | 4773 #if CONFIG_SNOW_DECODER |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4774 c->vertical_compose97i = ff_snow_vertical_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4775 c->horizontal_compose97i = ff_snow_horizontal_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4776 c->inner_add_yblock = ff_snow_inner_add_yblock; |
3199
1651e69b9f7a
10l: Only set *compose97i *add_yblock to dsputils context if we are building with Snow enabled
gpoirier
parents:
3198
diff
changeset
|
4777 #endif |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4778 |
8590 | 4779 #if CONFIG_VORBIS_DECODER |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4780 c->vorbis_inverse_coupling = vorbis_inverse_coupling; |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4781 #endif |
8590 | 4782 #if CONFIG_AC3_DECODER |
7563 | 4783 c->ac3_downmix = ff_ac3_downmix_c; |
4784 #endif | |
8590 | 4785 #if CONFIG_FLAC_ENCODER |
5737 | 4786 c->flac_compute_autocorr = ff_flac_compute_autocorr; |
4787 #endif | |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4788 c->vector_fmul = vector_fmul_c; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4789 c->vector_fmul_reverse = vector_fmul_reverse_c; |
10300
4d1b9ca628fc
Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents:
10219
diff
changeset
|
4790 c->vector_fmul_add = vector_fmul_add_c; |
7261 | 4791 c->vector_fmul_window = ff_vector_fmul_window_c; |
7564 | 4792 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
10104
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4793 c->vector_clipf = vector_clipf_c; |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4794 c->float_to_int16 = ff_float_to_int16_c; |
7261 | 4795 c->float_to_int16_interleave = ff_float_to_int16_interleave_c; |
7203
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4796 c->add_int16 = add_int16_c; |
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4797 c->sub_int16 = sub_int16_c; |
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4798 c->scalarproduct_int16 = scalarproduct_int16_c; |
10219 | 4799 c->scalarproduct_float = scalarproduct_float_c; |
4800 c->butterflies_float = butterflies_float_c; | |
4801 c->vector_fmul_scalar = vector_fmul_scalar_c; | |
4802 | |
4803 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c; | |
4804 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c; | |
4805 | |
4806 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c; | |
4807 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c; | |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4808 |
3245 | 4809 c->shrink[0]= ff_img_copy_plane; |
4810 c->shrink[1]= ff_shrink22; | |
4811 c->shrink[2]= ff_shrink44; | |
4812 c->shrink[3]= ff_shrink88; | |
4813 | |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4814 c->prefetch= just_return; |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4815 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4816 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4817 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4818 |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4819 if (HAVE_MMX) dsputil_init_mmx (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4820 if (ARCH_ARM) dsputil_init_arm (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4821 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4822 if (HAVE_VIS) dsputil_init_vis (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4823 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4824 if (ARCH_PPC) dsputil_init_ppc (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4825 if (HAVE_MMI) dsputil_init_mmi (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4826 if (ARCH_SH4) dsputil_init_sh4 (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4827 if (ARCH_BFIN) dsputil_init_bfin (c, avctx); |
1092 | 4828 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4829 for(i=0; i<64; i++){ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4830 if(!c->put_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4831 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4832 if(!c->avg_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4833 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4834 } |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4835 |
1092 | 4836 switch(c->idct_permutation_type){ |
4837 case FF_NO_IDCT_PERM: | |
4838 for(i=0; i<64; i++) | |
4839 c->idct_permutation[i]= i; | |
4840 break; | |
4841 case FF_LIBMPEG2_IDCT_PERM: | |
4842 for(i=0; i<64; i++) | |
4843 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); | |
4844 break; | |
4845 case FF_SIMPLE_IDCT_PERM: | |
4846 for(i=0; i<64; i++) | |
4847 c->idct_permutation[i]= simple_mmx_permutation[i]; | |
4848 break; | |
4849 case FF_TRANSPOSE_IDCT_PERM: | |
4850 for(i=0; i<64; i++) | |
4851 c->idct_permutation[i]= ((i&7)<<3) | (i>>3); | |
4852 break; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4853 case FF_PARTTRANS_IDCT_PERM: |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4854 for(i=0; i<64; i++) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4855 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4856 break; |
6600
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4857 case FF_SSE2_IDCT_PERM: |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4858 for(i=0; i<64; i++) |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4859 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7]; |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4860 break; |
1092 | 4861 default: |
1598
932d306bf1dc
av_log() patch by (Michel Bardiaux <mbardiaux at peaktime dot be>)
michael
parents:
1571
diff
changeset
|
4862 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n"); |
1092 | 4863 } |
0 | 4864 } |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
220
diff
changeset
|
4865 |