Mercurial > libavcodec.hg
annotate dsputil.h @ 1708:dea5b2946999 libavcodec
interlaced motion estimation
interlaced mpeg2 encoding
P & B frames
rate distorted interlaced mb decision
alternate scantable support
4mv encoding fixes (that's also why the regression tests change)
passing height to most dsp functions
interlaced mpeg4 encoding (no direct mode MBs yet)
various related cleanups
disabled old motion estimation algorithms (log, full, ...); they will either be fixed or removed
author | michael |
---|---|
date | Tue, 30 Dec 2003 16:07:57 +0000 |
parents | 11433ade9e06 |
children | 39a209b0c82c |
rev | line source |
---|---|
429 | 1 /* |
2 * DSP utils | |
3 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard. | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 */ | |
1102 | 19 |
20 /** | |
21 * @file dsputil.h | |
1106 | 22 * DSP utils. |
1213 | 23 * note, many functions in here may use MMX which trashes the FPU state, it is |
24 * absolutely necessary to call emms_c() between dsp & float/double code | |
1102 | 25 */ |
26 | |
0 | 27 #ifndef DSPUTIL_H |
28 #define DSPUTIL_H | |
29 | |
30 #include "common.h" | |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
31 #include "avcodec.h" |
0 | 32 |
1102 | 33 |
255 | 34 //#define DEBUG |
0 | 35 /* dct code */ |
36 typedef short DCTELEM; | |
37 | |
474
11dbd00682fc
avoid name clash with libjpeg - added missing externs
bellard
parents:
429
diff
changeset
|
38 void fdct_ifast (DCTELEM *data); |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1567
diff
changeset
|
39 void fdct_ifast248 (DCTELEM *data); |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
623
diff
changeset
|
40 void ff_jpeg_fdct_islow (DCTELEM *data); |
1567 | 41 void ff_fdct248_islow (DCTELEM *data); |
0 | 42 |
43 void j_rev_dct (DCTELEM *data); | |
44 | |
687
9abb13c21fbe
fdct_mmx -> ff_fdct_mmx (renamed to avoid namespace conflict with xvid)
arpi_esp
parents:
675
diff
changeset
|
45 void ff_fdct_mmx(DCTELEM *block); |
1565 | 46 void ff_fdct_mmx2(DCTELEM *block); |
0 | 47 |
34 | 48 /* encoding scans */ |
1064 | 49 extern const uint8_t ff_alternate_horizontal_scan[64]; |
50 extern const uint8_t ff_alternate_vertical_scan[64]; | |
51 extern const uint8_t ff_zigzag_direct[64]; | |
1567 | 52 extern const uint8_t ff_zigzag248_direct[64]; |
190
9e0e56869d05
fix for non-mmx runtimedetect encoding bugs - patch by Michael Niedermayer <michaelni@gmx.at>
uid46427
parents:
174
diff
changeset
|
53 |
0 | 54 /* pixel operations */ |
55 #define MAX_NEG_CROP 384 | |
56 | |
57 /* temporary */ | |
1064 | 58 extern uint32_t squareTbl[512]; |
59 extern uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; | |
0 | 60 |
61 | |
675 | 62 /* minimum alignment rules ;) |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
63 if you notice errors in the align stuff, need more alignment for some asm code for some cpu |
675 | 64 or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ... |
65 | |
66 !warning these alignments might not match reality, (missing attribute((align)) stuff somewhere possible) | |
67 i (michael) didn't check them, these are just the alignments which i think could be reached easily ... | |
68 | |
69 !future video codecs might need functions with less strict alignment | |
70 */ | |
0 | 71 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
72 /* |
1064 | 73 void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size); |
74 void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); | |
75 void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); | |
76 void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); | |
296 | 77 void clear_blocks_c(DCTELEM *blocks); |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
78 */ |
0 | 79 |
80 /* add and put pixel (decoding) */ | |
675 | 81 // blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16 |
1064 | 82 typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h); |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
83 typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h); |
1064 | 84 typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); |
1168 | 85 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); |
0 | 86 |
984 | 87 #define DEF_OLD_QPEL(name)\ |
1064 | 88 void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ |
89 void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ | |
90 void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
91 |
984 | 92 DEF_OLD_QPEL(qpel16_mc11_old_c) |
93 DEF_OLD_QPEL(qpel16_mc31_old_c) | |
94 DEF_OLD_QPEL(qpel16_mc12_old_c) | |
95 DEF_OLD_QPEL(qpel16_mc32_old_c) | |
96 DEF_OLD_QPEL(qpel16_mc13_old_c) | |
97 DEF_OLD_QPEL(qpel16_mc33_old_c) | |
98 DEF_OLD_QPEL(qpel8_mc11_old_c) | |
99 DEF_OLD_QPEL(qpel8_mc31_old_c) | |
100 DEF_OLD_QPEL(qpel8_mc12_old_c) | |
101 DEF_OLD_QPEL(qpel8_mc32_old_c) | |
102 DEF_OLD_QPEL(qpel8_mc13_old_c) | |
103 DEF_OLD_QPEL(qpel8_mc33_old_c) | |
651 | 104 |
105 #define CALL_2X_PIXELS(a, b, n)\ | |
106 static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ | |
107 b(block , pixels , line_size, h);\ | |
108 b(block+n, pixels+n, line_size, h);\ | |
109 } | |
255 | 110 |
0 | 111 /* motion estimation */ |
112 | |
1708 | 113 typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/; |
936 | 114 |
1168 | 115 |
1102 | 116 /** |
117 * DSPContext. | |
118 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
119 typedef struct DSPContext { |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
120 /* pixel ops : interface with DCT */ |
1064 | 121 void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size); |
122 void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); | |
123 void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | |
124 void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | |
1136 | 125 /** |
126 * translational global motion compensation. | |
127 */ | |
1064 | 128 void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder); |
1136 | 129 /** |
130 * global motion compensation. | |
131 */ | |
1064 | 132 void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy, |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
133 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
134 void (*clear_blocks)(DCTELEM *blocks/*align 16*/); |
1064 | 135 int (*pix_sum)(uint8_t * pix, int line_size); |
136 int (*pix_norm1)(uint8_t * pix, int line_size); | |
1708 | 137 // 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4 |
138 | |
139 me_cmp_func sad[4]; /* identical to pix_absAxA except additional void * */ | |
140 me_cmp_func sse[4]; | |
141 me_cmp_func hadamard8_diff[4]; | |
142 me_cmp_func dct_sad[4]; | |
143 me_cmp_func quant_psnr[4]; | |
144 me_cmp_func bit[4]; | |
145 me_cmp_func rd[4]; | |
936 | 146 int (*hadamard8_abs )(uint8_t *src, int stride, int mean); |
147 | |
1708 | 148 me_cmp_func me_pre_cmp[5]; |
149 me_cmp_func me_cmp[5]; | |
150 me_cmp_func me_sub_cmp[5]; | |
151 me_cmp_func mb_cmp[5]; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
152 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
153 /* maybe create an array for 16/8/4/2 functions */ |
1136 | 154 /** |
155 * Halfpel motion compensation with rounding (a+b+1)>>1. | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
156 * this is an array[4][4] of motion compensation functions for 4 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
157 * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions<br> |
1213 | 158 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 159 * @param block destination where the result is stored |
160 * @param pixels source | |
161 * @param line_size number of bytes in a horizontal line of block | |
162 * @param h height | |
163 */ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
164 op_pixels_func put_pixels_tab[4][4]; |
1136 | 165 |
166 /** | |
167 * Halfpel motion compensation with rounding (a+b+1)>>1. | |
1320 | 168 * This is an array[4][4] of motion compensation functions for 4 |
1319 | 169 * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions<br> |
1213 | 170 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 171 * @param block destination into which the result is averaged (a+b+1)>>1 |
172 * @param pixels source | |
173 * @param line_size number of bytes in a horizontal line of block | |
174 * @param h height | |
175 */ | |
1319 | 176 op_pixels_func avg_pixels_tab[4][4]; |
1136 | 177 |
178 /** | |
179 * Halfpel motion compensation with no rounding (a+b)>>1. | |
1225 | 180 * this is an array[2][4] of motion compensation functions for 2 |
181 * horizontal blocksizes (8,16) and the 4 halfpel positions<br> | |
1213 | 182 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 183 * @param block destination where the result is stored |
184 * @param pixels source | |
185 * @param line_size number of bytes in a horizontal line of block | |
186 * @param h height | |
187 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
188 op_pixels_func put_no_rnd_pixels_tab[2][4]; |
1136 | 189 |
190 /** | |
191 * Halfpel motion compensation with no rounding (a+b)>>1. | |
1225 | 192 * this is an array[2][4] of motion compensation functions for 2 |
193 * horizontal blocksizes (8,16) and the 4 halfpel positions<br> | |
1213 | 194 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 195 * @param block destination into which the result is averaged (a+b)>>1 |
196 * @param pixels source | |
197 * @param line_size number of bytes in a horizontal line of block | |
198 * @param h height | |
199 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
200 op_pixels_func avg_no_rnd_pixels_tab[2][4]; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
201 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
202 /** |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
203 * Thirdpel motion compensation with rounding (a+b+1)>>1. |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
204 * this is an array[11] of motion compensation functions for the 9 thirdpel positions<br> |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
205 * *pixels_tab[ xthirdpel + 4*ythirdpel ] |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
206 * @param block destination where the result is stored |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
207 * @param pixels source |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
208 * @param line_size number of bytes in a horizontal line of block |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
209 * @param h height |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
210 */ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
211 tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width? |
1319 | 212 tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width? |
213 | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
214 qpel_mc_func put_qpel_pixels_tab[2][16]; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
215 qpel_mc_func avg_qpel_pixels_tab[2][16]; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
216 qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
217 qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16]; |
936 | 218 qpel_mc_func put_mspel_pixels_tab[8]; |
1168 | 219 |
220 /** | |
221 * h264 chroma MC | |
222 */ | |
223 h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; | |
224 h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
225 |
1168 | 226 qpel_mc_func put_h264_qpel_pixels_tab[3][16]; |
227 qpel_mc_func avg_h264_qpel_pixels_tab[3][16]; | |
228 | |
1708 | 229 me_cmp_func pix_abs[2][4]; |
866 | 230 |
231 /* huffyuv specific */ | |
232 void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w); | |
936 | 233 void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w); |
1527 | 234 /** |
235 * subtract huffyuv's variant of median prediction | |
236 * note, this might read from src1[-1], src2[-1] | |
237 */ | |
238 void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top); | |
1273 | 239 void (*bswap_buf)(uint32_t *dst, uint32_t *src, int w); |
1092 | 240 |
1644 | 241 void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); |
242 void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); | |
243 | |
1092 | 244 /* (I)DCT */ |
245 void (*fdct)(DCTELEM *block/* align 16*/); | |
1567 | 246 void (*fdct248)(DCTELEM *block/* align 16*/); |
1102 | 247 |
1324
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1320
diff
changeset
|
248 /* IDCT really*/ |
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1320
diff
changeset
|
249 void (*idct)(DCTELEM *block/* align 16*/); |
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1320
diff
changeset
|
250 |
1102 | 251 /** |
1104 | 252 * block -> idct -> clip to unsigned 8 bit -> dest. |
1102 | 253 * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) |
1111 | 254 * @param line_size size in bytes of a horizontal line of dest |
1102 | 255 */ |
1092 | 256 void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); |
1102 | 257 |
258 /** | |
259 * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. | |
1111 | 260 * @param line_size size in bytes of a horizontal line of dest |
1102 | 261 */ |
1092 | 262 void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); |
1102 | 263 |
264 /** | |
1104 | 265 * idct input permutation. |
1241 | 266 * several optimized IDCTs need a permuted input (relative to the normal order of the reference |
267 * IDCT) | |
268 * this permutation must be performed before the idct_put/add, note, normally this can be merged | |
269 * with the zigzag/alternate scan<br> | |
1102 | 270 * an example to avoid confusion: |
271 * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...) | |
272 * - (x -> reference dct -> reference idct -> x) | |
273 * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x) | |
274 * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...) | |
275 */ | |
1092 | 276 uint8_t idct_permutation[64]; |
277 int idct_permutation_type; | |
278 #define FF_NO_IDCT_PERM 1 | |
279 #define FF_LIBMPEG2_IDCT_PERM 2 | |
280 #define FF_SIMPLE_IDCT_PERM 3 | |
281 #define FF_TRANSPOSE_IDCT_PERM 4 | |
282 | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
283 } DSPContext; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
284 |
1201 | 285 void dsputil_static_init(void); |
1092 | 286 void dsputil_init(DSPContext* p, AVCodecContext *avctx); |
0 | 287 |
764 | 288 /** |
289 * permute block according to permutation. | |
290 * @param last last non zero element in scantable order | |
291 */ | |
1064 | 292 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last); |
34 | 293 |
1264 | 294 #define BYTE_VEC32(c) ((c)*0x01010101UL) |
295 | |
296 static inline uint32_t rnd_avg32(uint32_t a, uint32_t b) | |
297 { | |
298 return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); | |
299 } | |
300 | |
301 static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b) | |
302 { | |
303 return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); | |
304 } | |
305 | |
1102 | 306 /** |
1104 | 307 * Empty mmx state. |
1102 | 308 * this must be called between any dsp function and float/double code. |
309 * for example sin(); dsp->idct_put(); emms_c(); cos() | |
310 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
311 #define emms_c() |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
312 |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
313 /* should be defined by architectures supporting |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
314 one or more MultiMedia extensions */ |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
315 int mm_support(void); |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
316 |
62 | 317 #if defined(HAVE_MMX) |
0 | 318 |
862 | 319 #undef emms_c |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
320 |
0 | 321 #define MM_MMX 0x0001 /* standard MMX */ |
322 #define MM_3DNOW 0x0004 /* AMD 3DNOW */ | |
323 #define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */ | |
324 #define MM_SSE 0x0008 /* SSE functions */ | |
325 #define MM_SSE2 0x0010 /* PIV SSE2 functions */ | |
326 | |
327 extern int mm_flags; | |
328 | |
1064 | 329 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); |
330 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); | |
0 | 331 |
332 static inline void emms(void) | |
333 { | |
6
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
334 __asm __volatile ("emms;":::"memory"); |
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
335 } |
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
336 |
936 | 337 |
6
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
338 #define emms_c() \ |
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
339 {\ |
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
340 if (mm_flags & MM_MMX)\ |
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
341 emms();\ |
0 | 342 } |
343 | |
344 #define __align8 __attribute__ ((aligned (8))) | |
345 | |
1092 | 346 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx); |
347 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); | |
1065 | 348 |
62 | 349 #elif defined(ARCH_ARMV4L) |
350 | |
351 /* This is to use 4 bytes read to the IDCT pointers for some 'zero' | |
352 line optimizations */ | |
353 #define __align8 __attribute__ ((aligned (4))) | |
354 | |
1092 | 355 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx); |
62 | 356 |
88 | 357 #elif defined(HAVE_MLIB) |
358 | |
359 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */ | |
360 #define __align8 __attribute__ ((aligned (8))) | |
361 | |
1092 | 362 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx); |
88 | 363 |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
364 #elif defined(ARCH_ALPHA) |
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
365 |
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
366 #define __align8 __attribute__ ((aligned (8))) |
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
367 |
1092 | 368 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx); |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
369 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
370 #elif defined(ARCH_POWERPC) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
371 |
894
a408778eff87
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
884
diff
changeset
|
372 #define MM_ALTIVEC 0x0001 /* standard AltiVec */ |
a408778eff87
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
884
diff
changeset
|
373 |
a408778eff87
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
884
diff
changeset
|
374 extern int mm_flags; |
a408778eff87
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
884
diff
changeset
|
375 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
376 #if defined(HAVE_ALTIVEC) && !defined(CONFIG_DARWIN) |
1653 | 377 #define pixel altivec_pixel |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
378 #include <altivec.h> |
1653 | 379 #undef pixel |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
380 #endif |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
381 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
382 #define __align8 __attribute__ ((aligned (16))) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
383 |
1092 | 384 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
385 |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
386 #elif defined(HAVE_MMI) |
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
387 |
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
388 #define __align8 __attribute__ ((aligned (16))) |
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
389 |
1092 | 390 void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx); |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
391 |
1259
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
392 #elif defined(ARCH_SH4) |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
393 |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
394 #define __align8 __attribute__ ((aligned (8))) |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
395 |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
396 void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx); |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
397 |
0 | 398 #else |
399 | |
400 #define __align8 | |
401 | |
402 #endif | |
403 | |
493
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
404 #ifdef __GNUC__ |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
405 |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
406 struct unaligned_64 { uint64_t l; } __attribute__((packed)); |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
407 struct unaligned_32 { uint32_t l; } __attribute__((packed)); |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
408 struct unaligned_16 { uint16_t l; } __attribute__((packed)); |
493
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
409 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
410 #define LD16(a) (((const struct unaligned_16 *) (a))->l) |
493
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
411 #define LD32(a) (((const struct unaligned_32 *) (a))->l) |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
412 #define LD64(a) (((const struct unaligned_64 *) (a))->l) |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
413 |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
414 #define ST32(a, b) (((struct unaligned_32 *) (a))->l) = (b) |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
415 |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
416 #else /* __GNUC__ */ |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
417 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Fallback branch (after "#else"): read a uint16_t by casting a to uint16_t* and dereferencing. */
/* NOTE(review): the cast is to a non-const pointer and there is no packed wrapper here; presumably this relies on the target tolerating unaligned access -- confirm. */
418 #define LD16(a) (*((uint16_t*)(a))) |
493
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
/* Fallback branch (after "#else"): read a uint32_t by casting a to uint32_t* and dereferencing. */
/* NOTE(review): the cast is to a non-const pointer and there is no packed wrapper here; presumably this relies on the target tolerating unaligned access -- confirm. */
419 #define LD32(a) (*((uint32_t*)(a))) |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
/* Fallback branch (after "#else"): read a uint64_t by casting a to uint64_t* and dereferencing. */
/* NOTE(review): the cast is to a non-const pointer and there is no packed wrapper here; presumably this relies on the target tolerating unaligned access -- confirm. */
420 #define LD64(a) (*((uint64_t*)(a))) |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
421 |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
/* Fallback branch (after "#else"): store b at address a by casting a to uint32_t* and assigning. */
/* NOTE(review): the expansion is a bare assignment with no enclosing parentheses, so it is only safe as a stand-alone statement. */
422 #define ST32(a, b) *((uint32_t*)(a)) = (b) |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
423 |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
424 #endif /* !__GNUC__ */ |
873b9075d853
move unaligned access macros to dsputil.h - added unaligned 32 bit store
bellard
parents:
480
diff
changeset
|
425 |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
426 /* PSNR */ |
/*
 * Prototype only; the definition is not in this header.
 * orig_image / coded_image: arrays of three uint8_t pointers each
 *   (NOTE(review): presumably one pointer per image plane -- confirm).
 * orig_linesize: array of three ints; coded_linesize: a single int
 *   (NOTE(review): presumably the coded image shares one line size -- confirm).
 * avctx: codec context; how it is used is not visible here.
 * Returns nothing, so any result must be delivered through the arguments.
 */
1064 | 427 void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
428 int orig_linesize[3], int coded_linesize, |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
429 AVCodecContext *avctx); |
781 | 430 |
431 /* FFT computation */ | |
432 | |
433 /* NOTE: soon integer code will be added, so you must use the |
434 FFTSample type */ |
/* Scalar sample type used by the FFT and MDCT declarations; currently an alias for float. */
435 typedef float FFTSample; |
436 | |
/* One complex value made of two FFTSample members: re (real part) and im (imaginary part). */
437 typedef struct FFTComplex { |
438 FFTSample re, im; |
439 } FFTComplex; |
440 | |
/* State passed as the first argument to every fft_* function declared in this header. */
441 typedef struct FFTContext { |
/* NOTE(review): presumably log2 of the transform size -- confirm against fft_init(). */
442 int nbits; |
/* NOTE(review): presumably non-zero selects the inverse transform -- confirm. */
443 int inverse; |
/* NOTE(review): presumably a permutation (bit-reversal) table used by fft_permute() -- confirm. */
444 uint16_t *revtab; |
/* NOTE(review): presumably the table of complex exponentials (twiddle factors) -- confirm. */
445 FFTComplex *exptab; |
446 FFTComplex *exptab1; /* only used by SSE code */ |
/* Implementation hook; the inline fft_calc() wrapper calls s->fft_calc(s, z). */
447 void (*fft_calc)(struct FFTContext *s, FFTComplex *z); |
448 } FFTContext; |
449 | |
/*
 * FFT entry points (prototypes only; definitions live elsewhere).
 * fft_init takes the same nbits / inverse values that FFTContext has fields for
 * and returns an int (NOTE(review): presumably a status code -- confirm).
 * fft_calc_c and fft_calc_sse have exactly the signature of the
 * FFTContext.fft_calc function pointer, so either can be installed there.
 */
450 int fft_init(FFTContext *s, int nbits, int inverse); |
451 void fft_permute(FFTContext *s, FFTComplex *z); |
452 void fft_calc_c(FFTContext *s, FFTComplex *z); |
453 void fft_calc_sse(FFTContext *s, FFTComplex *z); |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
954
diff
changeset
|
/* Prototype only; same signature as the FFTContext.fft_calc function pointer. */
/* NOTE(review): presumably the AltiVec (PPC) implementation of the FFT -- confirm. */
454 void fft_calc_altivec(FFTContext *s, FFTComplex *z); |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
954
diff
changeset
|
455 |
/* Run the FFT on z by calling the implementation pointer stored in s->fft_calc. */
/* s->fft_calc is dereferenced unconditionally, so it must already be set. */
781 | 456 static inline void fft_calc(FFTContext *s, FFTComplex *z)
457 { |
458 s->fft_calc(s, z); |
459 } |
/* Prototype only. NOTE(review): presumably releases whatever fft_init() set up in s -- confirm. */
460 void fft_end(FFTContext *s); |
461 | |
462 /* MDCT computation */ | |
463 | |
/* State passed as the first argument to the ff_mdct_* / ff_imdct_* functions; embeds an FFTContext. */
464 typedef struct MDCTContext { |
465 int n; /* MDCT size, i.e. twice the number of input samples */ |
466 int nbits; /* n = 2^nbits */ |
467 /* pre/post rotation tables */ |
468 FFTSample *tcos; |
469 FFTSample *tsin; |
/* FFT state stored by value inside this context */
470 FFTContext fft; |
471 } MDCTContext; |
472 | |
/* Prototype only. Takes nbits (MDCTContext documents n = 2^nbits) and an inverse flag; returns an int. */
/* NOTE(review): presumably the return value is a status code -- confirm. */
794 | 473 int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
/* Prototype only. input is read-only (const); output and tmp are writable buffers. */
/* NOTE(review): presumably the inverse MDCT; required buffer sizes are not stated here -- confirm. */
474 void ff_imdct_calc(MDCTContext *s, FFTSample *output, |
781 | 475 const FFTSample *input, FFTSample *tmp);
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
/* Prototype only. input is read-only (const); out and tmp are writable buffers. */
/* NOTE(review): presumably the forward MDCT; required buffer sizes are not stated here -- confirm. */
476 void ff_mdct_calc(MDCTContext *s, FFTSample *out, |
781 | 477 const FFTSample *input, FFTSample *tmp);
/* Prototype only. NOTE(review): presumably releases whatever ff_mdct_init() set up in s -- confirm. */
794 | 478 void ff_mdct_end(MDCTContext *s);
781 | 479 |
/*
 * Defines a static function name16(s, dst, src, stride, h) that calls name8
 * twice -- once at dst/src and once at dst+8/src+8 -- with the same stride and
 * the same height h, and returns the sum of both results.
 * NOTE(review): "WARPER" looks like a misspelling of "WRAPPER"; the name is kept
 * because every user of the macro refers to it.
 */
1708 | 480 #define WARPER8_16(name8, name16)\
481 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ |
482 return name8(s, dst , src , stride, h)\ |
483 +name8(s, dst+8 , src+8 , stride, h);\ |
484 } |
485 | |
/*
 * Defines a static function name16(s, dst, src, stride, h) built from name8,
 * which is always called with a height of 8.
 * The first two calls cover dst/src and dst+8/src+8. Only when h == 16 are dst
 * and src advanced by 8*stride and the same two calls repeated.
 * Any other h gets just the first two calls. Returns the sum of all call results.
 */
486 #define WARPER8_16_SQ(name8, name16)\ |
487 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ |
488 int score=0;\ |
489 score +=name8(s, dst , src , stride, 8);\ |
490 score +=name8(s, dst+8 , src+8 , stride, 8);\ |
491 if(h==16){\ |
492 dst += 8*stride;\ |
493 src += 8*stride;\ |
494 score +=name8(s, dst , src , stride, 8);\ |
495 score +=name8(s, dst+8 , src+8 , stride, 8);\ |
496 }\ |
497 return score;\ |
936 | 498 }
499 | |
838
b78812db886f
lrintf detection (based upon a patch by François Revol <revol at free dot fr>)
michaelni
parents:
802
diff
changeset
|
500 #ifndef HAVE_LRINTF |
796
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
501 /* XXX: add an ISO C specific test to avoid BSD-specific testing. */ |
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
502 /* A better-than-nothing implementation. */ |
802 | 503 /* btw, rintf() also exists on FreeBSD -- alex */
796
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
/*
 * Replacement lrintf(), compiled only inside the "#ifndef HAVE_LRINTF" region.
 * With CONFIG_WIN32 defined it returns x cast to int; otherwise it returns
 * rint(x) cast to int.
 * NOTE(review): both paths cast to int although the return type is long int,
 * and the non-win32 path calls the double-precision rint() on a float.
 */
504 static inline long int lrintf(float x) |
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
505 { |
1040
998d5035b15b
win32: rint() does not seem to be defined with mingw32-gcc 2.95 - do you have a better solution ?
bellard
parents:
1033
diff
changeset
|
506 #ifdef CONFIG_WIN32 |
998d5035b15b
win32: rint() does not seem to be defined with mingw32-gcc 2.95 - do you have a better solution ?
bellard
parents:
1033
diff
changeset
|
507 /* XXX: incorrect (the cast truncates toward zero instead of rounding), but it makes this compile */ |
998d5035b15b
win32: rint() does not seem to be defined with mingw32-gcc 2.95 - do you have a better solution ?
bellard
parents:
1033
diff
changeset
|
508 return (int)(x); |
998d5035b15b
win32: rint() does not seem to be defined with mingw32-gcc 2.95 - do you have a better solution ?
bellard
parents:
1033
diff
changeset
|
509 #else |
796
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
510 return (int)(rint(x)); |
1040
998d5035b15b
win32: rint() does not seem to be defined with mingw32-gcc 2.95 - do you have a better solution ?
bellard
parents:
1033
diff
changeset
|
511 #endif |
796
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
512 } |
0 | 513 #endif |
796
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
514 |
8a5b70c68fbd
added lrintf for non ISOC libcs (fixme: find a better test)
bellard
parents:
794
diff
changeset
|
515 #endif |