comparison src/ffmpeg/libffwma/dsputil.c @ 806:74abcb9cafae trunk

[svn] - fork wma plugin
author nenolod
date Mon, 12 Mar 2007 10:59:21 -0700
parents src/wma/libffwma/dsputil.c@3da1b8942b8b
children
comparison
equal deleted inserted replaced
805:1ba5f86aeac9 806:74abcb9cafae
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "simple_idct.h"
31
32 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
33 uint32_t squareTbl[512];
34
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
45
/* Zigzag scan used with the 2-4-8 IDCT.  NOTE: unlike the specification,
 * the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
58
59 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
60 uint16_t __align8 inv_zigzag_direct16[64];
61
/* Alternate (horizontally biased) scan order for an 8x8 block; entry i is
 * the raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
72
/* Alternate (vertically biased) scan order for an 8x8 block; entry i is
 * the raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
83
/* Fixed-point reciprocals used to replace a division by a multiplication:
 *     a*inverse[b]>>32 == a/b   for all 0<=a<=65536 && 2<=b<=255
 * The product has to be evaluated with at least 64 bits of precision.
 * Entries 0 and 1 lie outside the range covered by that guarantee. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
119
/* Coefficient input permutation for simple_idct_mmx: a 64-entry table
 * containing each index 0x00..0x3F exactly once. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
131 #if 0
/**
 * Add up all samples of a 16x16 pixel block.
 * @param pix       address of the top-left sample
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
153
154 static int pix_norm1_c(uint8_t * pix, int line_size)
155 {
156 int s, i, j;
157 uint32_t *sq = squareTbl + 256;
158
159 s = 0;
160 for (i = 0; i < 16; i++) {
161 for (j = 0; j < 16; j += 8) {
162 #if 0
163 s += sq[pix[0]];
164 s += sq[pix[1]];
165 s += sq[pix[2]];
166 s += sq[pix[3]];
167 s += sq[pix[4]];
168 s += sq[pix[5]];
169 s += sq[pix[6]];
170 s += sq[pix[7]];
171 #else
172 #if LONG_MAX > 2147483647
173 register uint64_t x=*(uint64_t*)pix;
174 s += sq[x&0xff];
175 s += sq[(x>>8)&0xff];
176 s += sq[(x>>16)&0xff];
177 s += sq[(x>>24)&0xff];
178 s += sq[(x>>32)&0xff];
179 s += sq[(x>>40)&0xff];
180 s += sq[(x>>48)&0xff];
181 s += sq[(x>>56)&0xff];
182 #else
183 register uint32_t x=*(uint32_t*)pix;
184 s += sq[x&0xff];
185 s += sq[(x>>8)&0xff];
186 s += sq[(x>>16)&0xff];
187 s += sq[(x>>24)&0xff];
188 x=*(uint32_t*)(pix+4);
189 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 #endif
194 #endif
195 pix += 8;
196 }
197 pix += line_size - 16;
198 }
199 return s;
200 }
201
/**
 * Copy w 32-bit words from src to dst, passing each one through bswap_32().
 * @param dst destination words
 * @param src source words
 * @param w   number of 32-bit words to process; nothing is done if w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    /* words are handled strictly in ascending index order */
    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
219
220 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
221 {
222 int s, i;
223 uint32_t *sq = squareTbl + 256;
224
225 s = 0;
226 for (i = 0; i < h; i++) {
227 s += sq[pix1[0] - pix2[0]];
228 s += sq[pix1[1] - pix2[1]];
229 s += sq[pix1[2] - pix2[2]];
230 s += sq[pix1[3] - pix2[3]];
231 s += sq[pix1[4] - pix2[4]];
232 s += sq[pix1[5] - pix2[5]];
233 s += sq[pix1[6] - pix2[6]];
234 s += sq[pix1[7] - pix2[7]];
235 pix1 += line_size;
236 pix2 += line_size;
237 }
238 return s;
239 }
240
241 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
242 {
243 int s, i;
244 uint32_t *sq = squareTbl + 256;
245
246 s = 0;
247 for (i = 0; i < h; i++) {
248 s += sq[pix1[ 0] - pix2[ 0]];
249 s += sq[pix1[ 1] - pix2[ 1]];
250 s += sq[pix1[ 2] - pix2[ 2]];
251 s += sq[pix1[ 3] - pix2[ 3]];
252 s += sq[pix1[ 4] - pix2[ 4]];
253 s += sq[pix1[ 5] - pix2[ 5]];
254 s += sq[pix1[ 6] - pix2[ 6]];
255 s += sq[pix1[ 7] - pix2[ 7]];
256 s += sq[pix1[ 8] - pix2[ 8]];
257 s += sq[pix1[ 9] - pix2[ 9]];
258 s += sq[pix1[10] - pix2[10]];
259 s += sq[pix1[11] - pix2[11]];
260 s += sq[pix1[12] - pix2[12]];
261 s += sq[pix1[13] - pix2[13]];
262 s += sq[pix1[14] - pix2[14]];
263 s += sq[pix1[15] - pix2[15]];
264
265 pix1 += line_size;
266 pix2 += line_size;
267 }
268 return s;
269 }
270
271 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
272 {
273 int i;
274
275 /* read the pixels */
276 for(i=0;i<8;i++) {
277 block[0] = pixels[0];
278 block[1] = pixels[1];
279 block[2] = pixels[2];
280 block[3] = pixels[3];
281 block[4] = pixels[4];
282 block[5] = pixels[5];
283 block[6] = pixels[6];
284 block[7] = pixels[7];
285 pixels += line_size;
286 block += 8;
287 }
288 }
289
290 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
291 const uint8_t *s2, int stride){
292 int i;
293
294 /* read the pixels */
295 for(i=0;i<8;i++) {
296 block[0] = s1[0] - s2[0];
297 block[1] = s1[1] - s2[1];
298 block[2] = s1[2] - s2[2];
299 block[3] = s1[3] - s2[3];
300 block[4] = s1[4] - s2[4];
301 block[5] = s1[5] - s2[5];
302 block[6] = s1[6] - s2[6];
303 block[7] = s1[7] - s2[7];
304 s1 += stride;
305 s2 += stride;
306 block += 8;
307 }
308 }
309
310
311 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
312 int line_size)
313 {
314 int i;
315 uint8_t *cm = cropTbl + MAX_NEG_CROP;
316
317 /* read the pixels */
318 for(i=0;i<8;i++) {
319 pixels[0] = cm[block[0]];
320 pixels[1] = cm[block[1]];
321 pixels[2] = cm[block[2]];
322 pixels[3] = cm[block[3]];
323 pixels[4] = cm[block[4]];
324 pixels[5] = cm[block[5]];
325 pixels[6] = cm[block[6]];
326 pixels[7] = cm[block[7]];
327
328 pixels += line_size;
329 block += 8;
330 }
331 }
332
333 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
334 int line_size)
335 {
336 int i;
337 uint8_t *cm = cropTbl + MAX_NEG_CROP;
338
339 /* read the pixels */
340 for(i=0;i<8;i++) {
341 pixels[0] = cm[pixels[0] + block[0]];
342 pixels[1] = cm[pixels[1] + block[1]];
343 pixels[2] = cm[pixels[2] + block[2]];
344 pixels[3] = cm[pixels[3] + block[3]];
345 pixels[4] = cm[pixels[4] + block[4]];
346 pixels[5] = cm[pixels[5] + block[5]];
347 pixels[6] = cm[pixels[6] + block[6]];
348 pixels[7] = cm[pixels[7] + block[7]];
349 pixels += line_size;
350 block += 8;
351 }
352 }
353 #endif
354 #if 0
355
/*
 * 64-bit flavour of PIXOP2 (this is the branch that the enclosing "#if 0"
 * compiles out).  PIXOP2(OPNAME, OP) generates 8-pixel-wide helpers that
 * handle one row with a single 64-bit load (LD64) and a single 64-bit
 * store through OP(dst, value):
 *   OPNAME_pixels_c             plain copy/combine
 *   OPNAME_[no_rnd_]pixels_x2_c average with the pixel to the right
 *   OPNAME_[no_rnd_]pixels_y2_c average with the pixel below
 *   OPNAME_[no_rnd_]pixels_xy2_c average of the 2x2 neighbourhood
 * plus 16-wide versions built with CALL_2X_PIXELS.
 *
 * Per-byte averages are formed without unpacking:
 *   (a|b) - (((a^b)&0xFE..)>>1) rounds up, (a&b) + (((a^b)&0xFE..)>>1)
 *   rounds down.
 * The xy2 helpers split every byte into its low 2 bits (l*) and its high
 * 6 bits pre-shifted by 2 (h*) so the four-way sum cannot carry between
 * bytes; the rounding constant is 2 per byte (1 for the no_rnd variant).
 * They emit two rows per loop iteration.
 *
 * The first helper is named OPNAME_pixels_c because that is the name the
 * CALL_2X_PIXELS line below refers to.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t first_a= LD64(pixels  );\
    const uint64_t first_b= LD64(pixels+1);\
    uint64_t l0=  (first_a&0x0303030303030303ULL)\
                + (first_b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((first_a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((first_b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t first_a= LD64(pixels  );\
    const uint64_t first_b= LD64(pixels+1);\
    uint64_t l0=  (first_a&0x0303030303030303ULL)\
                + (first_b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((first_a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((first_b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Averaging store for the 64-bit flavour: per-byte rounded-up mean of the
 * current destination value and b. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
496 #else // 64 bit variant
497
/*
 * 32-bit flavour of PIXOP2 (the "#else" branch, i.e. the one that is
 * compiled).  PIXOP2(OPNAME, OP) generates pixel copy/average helpers that
 * work on 2, 4 or 8 pixels per row using 16/32-bit loads (LD16/LD32) and
 * stores performed through OP(dst, value):
 *   OPNAME_pixels{2,4,8}_c          plain copy/combine
 *   OPNAME_[no_rnd_]pixelsN_l2      average of two sources
 *   OPNAME_[no_rnd_]pixelsN_l4      average of four sources
 *   OPNAME_[no_rnd_]pixelsN_x2_c    average with the pixel to the right
 *   OPNAME_[no_rnd_]pixelsN_y2_c    average with the pixel below
 *   OPNAME_[no_rnd_]pixelsN_xy2_c   average of the 2x2 neighbourhood
 * plus 16-wide versions built with CALL_2X_PIXELS.
 *
 * Four-way averages split every byte into its low 2 bits (lo*) and its
 * high 6 bits pre-shifted by 2 (hi*) so the sum cannot carry between
 * bytes; the rounding constant is 2 per byte (1 for the no_rnd variants).
 * The xy2 helpers emit two rows per loop iteration.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    for(; h>0; h--){\
        OP(*((uint16_t*)(block)), LD16(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    for(; h>0; h--){\
        OP(*((uint32_t*)(block)), LD32(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    for(; h>0; h--){\
        int k;\
        for(k=0; k<8; k+=4){\
            OP(*((uint32_t*)(block+k)), LD32(pixels+k));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    /* a plain copy needs no rounding, so reuse the rounding version */\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int row, k;\
    for(row=0; row<h; row++){\
        for(k=0; k<8; k+=4){\
            const uint32_t p= LD32(&src1[row*src_stride1+k]);\
            const uint32_t q= LD32(&src2[row*src_stride2+k]);\
            OP(*((uint32_t*)&dst[row*dst_stride+k]), no_rnd_avg32(p, q));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int row, k;\
    for(row=0; row<h; row++){\
        for(k=0; k<8; k+=4){\
            const uint32_t p= LD32(&src1[row*src_stride1+k]);\
            const uint32_t q= LD32(&src2[row*src_stride2+k]);\
            OP(*((uint32_t*)&dst[row*dst_stride+k]), rnd_avg32(p, q));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int row;\
    for(row=0; row<h; row++){\
        const uint32_t p= LD32(&src1[row*src_stride1]);\
        const uint32_t q= LD32(&src2[row*src_stride2]);\
        OP(*((uint32_t*)&dst[row*dst_stride]), rnd_avg32(p, q));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int row;\
    for(row=0; row<h; row++){\
        const uint32_t p= LD16(&src1[row*src_stride1]);\
        const uint32_t q= LD16(&src2[row*src_stride2]);\
        OP(*((uint16_t*)&dst[row*dst_stride]), rnd_avg32(p, q));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    /* left half, then right half */\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    /* left half, then right half */\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *right= pixels+1;\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, right, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *right= pixels+1;\
    OPNAME ## _pixels8_l2(block, pixels, right, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *below= pixels+line_size;\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, below, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *below= pixels+line_size;\
    OPNAME ## _pixels8_l2(block, pixels, below, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int row, k;\
    for(row=0; row<h; row++){\
        for(k=0; k<8; k+=4){\
            const uint32_t a= LD32(&src1[row*src_stride1+k]);\
            const uint32_t b= LD32(&src2[row*src_stride2+k]);\
            const uint32_t c= LD32(&src3[row*src_stride3+k]);\
            const uint32_t d= LD32(&src4[row*src_stride4+k]);\
            const uint32_t lo_ab=  (a&0x03030303UL)\
                                 + (b&0x03030303UL)\
                                 + 0x02020202UL;\
            const uint32_t hi_ab= ((a&0xFCFCFCFCUL)>>2)\
                                + ((b&0xFCFCFCFCUL)>>2);\
            const uint32_t lo_cd=  (c&0x03030303UL)\
                                 + (d&0x03030303UL);\
            const uint32_t hi_cd= ((c&0xFCFCFCFCUL)>>2)\
                                + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[row*dst_stride+k]), hi_ab+hi_cd+(((lo_ab+lo_cd)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *right= pixels+1;\
    OPNAME ## _pixels4_l2(block, pixels, right, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *below= pixels+line_size;\
    OPNAME ## _pixels4_l2(block, pixels, below, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *right= pixels+1;\
    OPNAME ## _pixels2_l2(block, pixels, right, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    const uint8_t *below= pixels+line_size;\
    OPNAME ## _pixels2_l2(block, pixels, below, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int row, k;\
    for(row=0; row<h; row++){\
        for(k=0; k<8; k+=4){\
            const uint32_t a= LD32(&src1[row*src_stride1+k]);\
            const uint32_t b= LD32(&src2[row*src_stride2+k]);\
            const uint32_t c= LD32(&src3[row*src_stride3+k]);\
            const uint32_t d= LD32(&src4[row*src_stride4+k]);\
            const uint32_t lo_ab=  (a&0x03030303UL)\
                                 + (b&0x03030303UL)\
                                 + 0x01010101UL;\
            const uint32_t hi_ab= ((a&0xFCFCFCFCUL)>>2)\
                                + ((b&0xFCFCFCFCUL)>>2);\
            const uint32_t lo_cd=  (c&0x03030303UL)\
                                 + (d&0x03030303UL);\
            const uint32_t hi_cd= ((c&0xFCFCFCFCUL)>>2)\
                                + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[row*dst_stride+k]), hi_ab+hi_cd+(((lo_ab+lo_cd)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    /* left half, then right half */\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    /* left half, then right half */\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, left_rnd, right_rnd, left, right;\
    /* horizontal pair sums of the first row, rounding term included */\
    right_rnd= pixels[1] + 2;\
    left_rnd = pixels[0] + right_rnd;\
    right_rnd+= pixels[2];\
\
    pixels+=line_size;\
    for(row=0; row<h; row+=2){\
        right= pixels[1];\
        left = pixels[0] + right;\
        right+= pixels[2];\
\
        block[0]= (left +left_rnd )>>2; /* FIXME non put */\
        block[1]= (right+right_rnd)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        right_rnd= pixels[1] + 2;\
        left_rnd = pixels[0] + right_rnd;\
        right_rnd+= pixels[2];\
\
        block[0]= (left +left_rnd )>>2;\
        block[1]= (right+right_rnd)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    const uint32_t first_a= LD32(pixels  );\
    const uint32_t first_b= LD32(pixels+1);\
    uint32_t lo_rnd=  (first_a&0x03030303UL)\
                    + (first_b&0x03030303UL)\
                    + 0x02020202UL;\
    uint32_t hi_rnd= ((first_a&0xFCFCFCFCUL)>>2)\
                   + ((first_b&0xFCFCFCFCUL)>>2);\
    uint32_t lo,hi;\
\
    pixels+=line_size;\
    for(row=0; row<h; row+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        lo=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        hi= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), hi_rnd+hi+(((lo_rnd+lo)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        lo_rnd=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
        hi_rnd= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), hi_rnd+hi+(((lo_rnd+lo)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    /* process the block as two 4-pixel-wide columns */\
    for(half=0; half<2; half++){\
        int row;\
        const uint32_t first_a= LD32(pixels  );\
        const uint32_t first_b= LD32(pixels+1);\
        uint32_t lo_rnd=  (first_a&0x03030303UL)\
                        + (first_b&0x03030303UL)\
                        + 0x02020202UL;\
        uint32_t hi_rnd= ((first_a&0xFCFCFCFCUL)>>2)\
                       + ((first_b&0xFCFCFCFCUL)>>2);\
        uint32_t lo,hi;\
\
        pixels+=line_size;\
        for(row=0; row<h; row+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            lo=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            hi= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_rnd+hi+(((lo_rnd+lo)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            lo_rnd=  (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x02020202UL;\
            hi_rnd= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_rnd+hi+(((lo_rnd+lo)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 pixels to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    /* process the block as two 4-pixel-wide columns */\
    for(half=0; half<2; half++){\
        int row;\
        const uint32_t first_a= LD32(pixels  );\
        const uint32_t first_b= LD32(pixels+1);\
        uint32_t lo_rnd=  (first_a&0x03030303UL)\
                        + (first_b&0x03030303UL)\
                        + 0x01010101UL;\
        uint32_t hi_rnd= ((first_a&0xFCFCFCFCUL)>>2)\
                       + ((first_b&0xFCFCFCFCUL)>>2);\
        uint32_t lo,hi;\
\
        pixels+=line_size;\
        for(row=0; row<h; row+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            lo=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            hi= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_rnd+hi+(((lo_rnd+lo)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            lo_rnd=  (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x01010101UL;\
            hi_rnd= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_rnd+hi+(((lo_rnd+lo)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 pixels to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

/* Averaging store for the 32-bit flavour: combine the current destination
 * value with b through rnd_avg32(). */
#define op_avg(a, b) a = rnd_avg32(a, b)
865 #endif
/* Plain store operator for PIXOP2: assign b to a. */
#define op_put(a, b) a = b

/* The PIXOP2 instantiations are commented out here, so none of the
 * put_ / avg_ pixel helpers are generated in this file. */
//PIXOP2(avg, op_avg)
//PIXOP2(put, op_put)

/* The operator macros are only meant for the instantiations above. */
#undef op_avg
#undef op_put
872
/* Rounded integer averages: avg2 is (a + b + 1) >> 1 and avg4 is
 * (a + b + c + d + 2) >> 2.  Every argument is parenthesized so that
 * expressions using lower-precedence operators (e.g. x & 3, c ? p : q) are
 * averaged as a whole instead of being torn apart by the additions.
 * Each argument is evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
875
876 /* init static data */
877 void dsputil_static_init(void)
878 {
879 int i;
880
881 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
882 for(i=0;i<MAX_NEG_CROP;i++) {
883 cropTbl[i] = 0;
884 cropTbl[i + MAX_NEG_CROP + 256] = 255;
885 }
886
887 for(i=0;i<512;i++) {
888 squareTbl[i] = (i - 256) * (i - 256);
889 }
890
891 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
892 }