Mercurial > libavcodec.hg
annotate i386/mpegvideo_mmx.c @ 625:bb6a69f9d409 libavcodec
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
per context DCT selection
author | michaelni |
---|---|
date | Thu, 29 Aug 2002 23:55:32 +0000 |
parents | a5aa53b6e648 |
children | 9abb13c21fbe |
rev | line source |
---|---|
8 | 1 /* |
2 * The simplest mpeg encoder (well, it was the simplest!) | |
429 | 3 * Copyright (c) 2000,2001 Fabrice Bellard. |
8 | 4 * |
429 | 5 * This library is free software; you can redistribute it and/or |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
8 | 9 * |
429 | 10 * This library is distributed in the hope that it will be useful, |
8 | 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 * Lesser General Public License for more details. | |
8 | 14 * |
429 | 15 * You should have received a copy of the GNU Lesser General Public |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
8 | 18 * |
19 * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> | |
325 | 20 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> |
8 | 21 */ |
22 | |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
23 #include "../dsputil.h" |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
24 #include "../mpegvideo.h" |
220 | 25 #include "../avcodec.h" |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
26 |
200 | 27 extern UINT8 zigzag_end[64]; |
220 | 28 |
29 extern UINT8 zigzag_direct_noperm[64]; | |
30 extern UINT16 inv_zigzag_direct16[64]; | |
31 extern UINT32 inverse[256]; | |
200 | 32 |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
33 #if 0 |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
34 |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
35 /* XXX: GL: I don't understand why this function needs optimization |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
36 (it is called only once per frame!), so I disabled it */ |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
37 |
8 | 38 void MPV_frame_start(MpegEncContext *s) |
39 { | |
40 if (s->pict_type == B_TYPE) { | |
41 __asm __volatile( | |
42 "movl (%1), %%eax\n\t" | |
43 "movl 4(%1), %%edx\n\t" | |
44 "movl 8(%1), %%ecx\n\t" | |
45 "movl %%eax, (%0)\n\t" | |
46 "movl %%edx, 4(%0)\n\t" | |
47 "movl %%ecx, 8(%0)\n\t" | |
48 : | |
49 :"r"(s->current_picture), "r"(s->aux_picture) | |
50 :"eax","edx","ecx","memory"); | |
51 } else { | |
52 /* swap next and last */ | |
53 __asm __volatile( | |
54 "movl (%1), %%eax\n\t" | |
55 "movl 4(%1), %%edx\n\t" | |
56 "movl 8(%1), %%ecx\n\t" | |
57 "xchgl (%0), %%eax\n\t" | |
58 "xchgl 4(%0), %%edx\n\t" | |
59 "xchgl 8(%0), %%ecx\n\t" | |
60 "movl %%eax, (%1)\n\t" | |
61 "movl %%edx, 4(%1)\n\t" | |
62 "movl %%ecx, 8(%1)\n\t" | |
63 "movl %%eax, (%2)\n\t" | |
64 "movl %%edx, 4(%2)\n\t" | |
65 "movl %%ecx, 8(%2)\n\t" | |
66 : | |
67 :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) | |
68 :"eax","edx","ecx","memory"); | |
69 } | |
70 } | |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
71 #endif |
8 | 72 |
73 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; | |
74 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | |
75 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
76 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
77 static void dct_unquantize_h263_mmx(MpegEncContext *s, |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
78 DCTELEM *block, int n, int qscale) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
79 { |
200 | 80 int i, level, qmul, qadd, nCoeffs; |
81 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
82 qmul = s->qscale << 1; |
248
56ee684c48bb
- H.263+ decoder support for Advanded INTRA Coding (buggy)
pulento
parents:
220
diff
changeset
|
83 if (s->h263_aic && s->mb_intra) |
56ee684c48bb
- H.263+ decoder support for Advanded INTRA Coding (buggy)
pulento
parents:
220
diff
changeset
|
84 qadd = 0; |
56ee684c48bb
- H.263+ decoder support for Advanded INTRA Coding (buggy)
pulento
parents:
220
diff
changeset
|
85 else |
56ee684c48bb
- H.263+ decoder support for Advanded INTRA Coding (buggy)
pulento
parents:
220
diff
changeset
|
86 qadd = (s->qscale - 1) | 1; |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
87 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
88 if (s->mb_intra) { |
249
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
89 if (!s->h263_aic) { |
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
90 if (n < 4) |
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
91 block[0] = block[0] * s->y_dc_scale; |
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
92 else |
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
93 block[0] = block[0] * s->c_dc_scale; |
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
94 } |
250
3449316664b5
- Bug fix on RTYPE (rounding type) not being honoured by H.263+ decoder.
pulento
parents:
249
diff
changeset
|
95 for(i=1; i<8; i++) { |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
96 level = block[i]; |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
97 if (level) { |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
98 if (level < 0) { |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
99 level = level * qmul - qadd; |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
100 } else { |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
101 level = level * qmul + qadd; |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
102 } |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
250
diff
changeset
|
103 block[i] = level; |
250
3449316664b5
- Bug fix on RTYPE (rounding type) not being honoured by H.263+ decoder.
pulento
parents:
249
diff
changeset
|
104 } |
3449316664b5
- Bug fix on RTYPE (rounding type) not being honoured by H.263+ decoder.
pulento
parents:
249
diff
changeset
|
105 } |
249
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
106 nCoeffs=64; |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
107 } else { |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
108 i = 0; |
249
42a0b7b16738
- Bug fixes in H.263+ Advanced INTRA Coding decoder.
pulento
parents:
248
diff
changeset
|
109 nCoeffs= zigzag_end[ s->block_last_index[n] ]; |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
110 } |
200 | 111 //printf("%d %d ", qmul, qadd); |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
112 asm volatile( |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
113 "movd %1, %%mm6 \n\t" //qmul |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
114 "packssdw %%mm6, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
115 "packssdw %%mm6, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
116 "movd %2, %%mm5 \n\t" //qadd |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
117 "pxor %%mm7, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
118 "packssdw %%mm5, %%mm5 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
119 "packssdw %%mm5, %%mm5 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
120 "psubw %%mm5, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
121 "pxor %%mm4, %%mm4 \n\t" |
153 | 122 ".balign 16\n\t" |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
123 "1: \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
124 "movq (%0, %3), %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
125 "movq 8(%0, %3), %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
126 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
127 "pmullw %%mm6, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
128 "pmullw %%mm6, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
129 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
130 "movq (%0, %3), %%mm2 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
131 "movq 8(%0, %3), %%mm3 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
132 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
133 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
134 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
135 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
136 "pxor %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
137 "pxor %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
138 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
139 "paddw %%mm7, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
140 "paddw %%mm7, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
141 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
142 "pxor %%mm0, %%mm2 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
143 "pxor %%mm1, %%mm3 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
144 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
145 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
146 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
147 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
148 "pandn %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
149 "pandn %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
150 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
151 "movq %%mm0, (%0, %3) \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
152 "movq %%mm1, 8(%0, %3) \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
153 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
154 "addl $16, %3 \n\t" |
200 | 155 "js 1b \n\t" |
156 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(i-nCoeffs)) | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
157 : "memory" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
158 ); |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
159 } |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
160 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
161 |
8 | 162 /* |
163 NK: | |
164 Note: looking at PARANOID: | |
165 "enable all paranoid tests for rounding, overflows, etc..." | |
166 | |
167 #ifdef PARANOID | |
168 if (level < -2048 || level > 2047) | |
169 fprintf(stderr, "unquant error %d %d\n", i, level); | |
170 #endif | |
171 We can suppose that the result of the two multiplications can't be greater than 0xFFFF, | |
172 i.e. it fits in 16 bits, so we use only the PMULLW instruction here and can avoid | |
173 a complex multiplication. | |
174 ===================================================== | |
175 Full formula for the multiplication of 2 integer numbers | |
176 which are represented as high:low words: | |
177 input: value1 = high1:low1 | |
178 value2 = high2:low2 | |
179 output: value3 = value1*value2 | |
180 value3=high3:low3 (on overflow: modulus 2^32 wrap-around) | |
181 this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4 | |
182 but this algorithm will compute only 0x66cb0ce4 | |
183 this is limited by the 16-bit size of the operands | |
184 --------------------------------- | |
185 tlow1 = high1*low2 | |
186 tlow2 = high2*low1 | |
187 tlow1 = tlow1 + tlow2 | |
188 high3:low3 = low1*low2 | |
189 high3 += tlow1 | |
190 */ | |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
191 static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, |
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
192 DCTELEM *block, int n, int qscale) |
8 | 193 { |
325 | 194 int nCoeffs; |
8 | 195 const UINT16 *quant_matrix; |
200 | 196 |
197 if(s->alternate_scan) nCoeffs= 64; | |
620
a5aa53b6e648
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michaelni
parents:
429
diff
changeset
|
198 else nCoeffs= zigzag_end[ s->block_last_index[n] ]; |
200 | 199 |
8 | 200 if (s->mb_intra) { |
325 | 201 int block0; |
8 | 202 if (n < 4) |
325 | 203 block0 = block[0] * s->y_dc_scale; |
8 | 204 else |
325 | 205 block0 = block[0] * s->c_dc_scale; |
8 | 206 /* XXX: only mpeg1 */ |
207 quant_matrix = s->intra_matrix; | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
208 asm volatile( |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
209 "pcmpeqw %%mm7, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
210 "psrlw $15, %%mm7 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
211 "movd %2, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
212 "packssdw %%mm6, %%mm6 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
213 "packssdw %%mm6, %%mm6 \n\t" |
325 | 214 "movl %3, %%eax \n\t" |
153 | 215 ".balign 16\n\t" |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
216 "1: \n\t" |
325 | 217 "movq (%0, %%eax), %%mm0 \n\t" |
218 "movq 8(%0, %%eax), %%mm1 \n\t" | |
219 "movq (%1, %%eax), %%mm4 \n\t" | |
220 "movq 8(%1, %%eax), %%mm5 \n\t" | |
221 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
222 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
223 "pxor %%mm2, %%mm2 \n\t" | |
224 "pxor %%mm3, %%mm3 \n\t" | |
225 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
226 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
227 "pxor %%mm2, %%mm0 \n\t" | |
228 "pxor %%mm3, %%mm1 \n\t" | |
229 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
230 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
231 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
232 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
233 "pxor %%mm4, %%mm4 \n\t" | |
234 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
235 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
236 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
237 "psraw $3, %%mm0 \n\t" | |
238 "psraw $3, %%mm1 \n\t" | |
239 "psubw %%mm7, %%mm0 \n\t" | |
240 "psubw %%mm7, %%mm1 \n\t" | |
241 "por %%mm7, %%mm0 \n\t" | |
242 "por %%mm7, %%mm1 \n\t" | |
243 "pxor %%mm2, %%mm0 \n\t" | |
244 "pxor %%mm3, %%mm1 \n\t" | |
245 "psubw %%mm2, %%mm0 \n\t" | |
246 "psubw %%mm3, %%mm1 \n\t" | |
247 "pandn %%mm0, %%mm4 \n\t" | |
248 "pandn %%mm1, %%mm5 \n\t" | |
249 "movq %%mm4, (%0, %%eax) \n\t" | |
250 "movq %%mm5, 8(%0, %%eax) \n\t" | |
251 | |
252 "addl $16, %%eax \n\t" | |
253 "js 1b \n\t" | |
254 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
255 : "%eax", "memory" | |
256 ); | |
257 block[0]= block0; | |
258 | |
259 } else { | |
344 | 260 quant_matrix = s->inter_matrix; |
325 | 261 asm volatile( |
262 "pcmpeqw %%mm7, %%mm7 \n\t" | |
263 "psrlw $15, %%mm7 \n\t" | |
264 "movd %2, %%mm6 \n\t" | |
265 "packssdw %%mm6, %%mm6 \n\t" | |
266 "packssdw %%mm6, %%mm6 \n\t" | |
267 "movl %3, %%eax \n\t" | |
268 ".balign 16\n\t" | |
269 "1: \n\t" | |
270 "movq (%0, %%eax), %%mm0 \n\t" | |
271 "movq 8(%0, %%eax), %%mm1 \n\t" | |
272 "movq (%1, %%eax), %%mm4 \n\t" | |
273 "movq 8(%1, %%eax), %%mm5 \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
274 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
275 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
276 "pxor %%mm2, %%mm2 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
277 "pxor %%mm3, %%mm3 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
278 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
279 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
280 "pxor %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
281 "pxor %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
282 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
283 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
284 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
285 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
286 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
287 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
288 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
289 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
290 "pxor %%mm4, %%mm4 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
291 "pxor %%mm5, %%mm5 \n\t" // FIXME slow |
325 | 292 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 |
293 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
294 "psraw $4, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
295 "psraw $4, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
296 "psubw %%mm7, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
297 "psubw %%mm7, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
298 "por %%mm7, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
299 "por %%mm7, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
300 "pxor %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
301 "pxor %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
302 "psubw %%mm2, %%mm0 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
303 "psubw %%mm3, %%mm1 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
304 "pandn %%mm0, %%mm4 \n\t" |
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
305 "pandn %%mm1, %%mm5 \n\t" |
325 | 306 "movq %%mm4, (%0, %%eax) \n\t" |
307 "movq %%mm5, 8(%0, %%eax) \n\t" | |
308 | |
309 "addl $16, %%eax \n\t" | |
310 "js 1b \n\t" | |
311 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | |
312 : "%eax", "memory" | |
313 ); | |
314 } | |
315 } | |
316 | |
317 static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, | |
318 DCTELEM *block, int n, int qscale) | |
319 { | |
320 int nCoeffs; | |
321 const UINT16 *quant_matrix; | |
322 | |
323 if(s->alternate_scan) nCoeffs= 64; | |
620
a5aa53b6e648
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michaelni
parents:
429
diff
changeset
|
324 else nCoeffs= zigzag_end[ s->block_last_index[n] ]; |
325 | 325 |
326 if (s->mb_intra) { | |
327 int block0; | |
328 if (n < 4) | |
329 block0 = block[0] * s->y_dc_scale; | |
330 else | |
331 block0 = block[0] * s->c_dc_scale; | |
332 quant_matrix = s->intra_matrix; | |
333 asm volatile( | |
334 "pcmpeqw %%mm7, %%mm7 \n\t" | |
335 "psrlw $15, %%mm7 \n\t" | |
336 "movd %2, %%mm6 \n\t" | |
337 "packssdw %%mm6, %%mm6 \n\t" | |
338 "packssdw %%mm6, %%mm6 \n\t" | |
339 "movl %3, %%eax \n\t" | |
340 ".balign 16\n\t" | |
341 "1: \n\t" | |
342 "movq (%0, %%eax), %%mm0 \n\t" | |
343 "movq 8(%0, %%eax), %%mm1 \n\t" | |
344 "movq (%1, %%eax), %%mm4 \n\t" | |
345 "movq 8(%1, %%eax), %%mm5 \n\t" | |
346 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
347 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
348 "pxor %%mm2, %%mm2 \n\t" | |
349 "pxor %%mm3, %%mm3 \n\t" | |
350 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
351 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
352 "pxor %%mm2, %%mm0 \n\t" | |
353 "pxor %%mm3, %%mm1 \n\t" | |
354 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
355 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
356 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
357 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
358 "pxor %%mm4, %%mm4 \n\t" | |
359 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
360 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
361 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
362 "psraw $3, %%mm0 \n\t" | |
363 "psraw $3, %%mm1 \n\t" | |
364 "pxor %%mm2, %%mm0 \n\t" | |
365 "pxor %%mm3, %%mm1 \n\t" | |
366 "psubw %%mm2, %%mm0 \n\t" | |
367 "psubw %%mm3, %%mm1 \n\t" | |
368 "pandn %%mm0, %%mm4 \n\t" | |
369 "pandn %%mm1, %%mm5 \n\t" | |
370 "movq %%mm4, (%0, %%eax) \n\t" | |
371 "movq %%mm5, 8(%0, %%eax) \n\t" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
372 |
325 | 373 "addl $16, %%eax \n\t" |
200 | 374 "js 1b \n\t" |
325 | 375 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) |
376 : "%eax", "memory" | |
377 ); | |
378 block[0]= block0; | |
379 //Note, we don't do mismatch control for intra as errors cannot accumulate | |
380 | |
381 } else { | |
344 | 382 quant_matrix = s->inter_matrix; |
325 | 383 asm volatile( |
384 "pcmpeqw %%mm7, %%mm7 \n\t" | |
385 "psrlq $48, %%mm7 \n\t" | |
386 "movd %2, %%mm6 \n\t" | |
387 "packssdw %%mm6, %%mm6 \n\t" | |
388 "packssdw %%mm6, %%mm6 \n\t" | |
389 "movl %3, %%eax \n\t" | |
390 ".balign 16\n\t" | |
391 "1: \n\t" | |
392 "movq (%0, %%eax), %%mm0 \n\t" | |
393 "movq 8(%0, %%eax), %%mm1 \n\t" | |
394 "movq (%1, %%eax), %%mm4 \n\t" | |
395 "movq 8(%1, %%eax), %%mm5 \n\t" | |
396 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
397 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
398 "pxor %%mm2, %%mm2 \n\t" | |
399 "pxor %%mm3, %%mm3 \n\t" | |
400 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
401 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
402 "pxor %%mm2, %%mm0 \n\t" | |
403 "pxor %%mm3, %%mm1 \n\t" | |
404 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
405 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
406 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
407 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
408 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | |
409 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | |
410 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
411 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
412 "pxor %%mm4, %%mm4 \n\t" | |
413 "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
414 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
415 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 | |
416 "psrlw $4, %%mm0 \n\t" | |
417 "psrlw $4, %%mm1 \n\t" | |
418 "pxor %%mm2, %%mm0 \n\t" | |
419 "pxor %%mm3, %%mm1 \n\t" | |
420 "psubw %%mm2, %%mm0 \n\t" | |
421 "psubw %%mm3, %%mm1 \n\t" | |
422 "pandn %%mm0, %%mm4 \n\t" | |
423 "pandn %%mm1, %%mm5 \n\t" | |
424 "pxor %%mm4, %%mm7 \n\t" | |
425 "pxor %%mm5, %%mm7 \n\t" | |
426 "movq %%mm4, (%0, %%eax) \n\t" | |
427 "movq %%mm5, 8(%0, %%eax) \n\t" | |
428 | |
429 "addl $16, %%eax \n\t" | |
430 "js 1b \n\t" | |
431 "movd 124(%0, %3), %%mm0 \n\t" | |
432 "movq %%mm7, %%mm6 \n\t" | |
433 "psrlq $32, %%mm7 \n\t" | |
434 "pxor %%mm6, %%mm7 \n\t" | |
435 "movq %%mm7, %%mm6 \n\t" | |
436 "psrlq $16, %%mm7 \n\t" | |
437 "pxor %%mm6, %%mm7 \n\t" | |
438 "pslld $31, %%mm7 \n\t" | |
439 "psrlq $15, %%mm7 \n\t" | |
440 "pxor %%mm7, %%mm0 \n\t" | |
441 "movd %%mm0, 124(%0, %3) \n\t" | |
442 | |
443 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) | |
444 : "%eax", "memory" | |
145
bd1adece8280
dct_unquantize_h263_mmx() by Michael Niedermayer <michaelni@gmx.at>
arpi_esp
parents:
14
diff
changeset
|
445 ); |
8 | 446 } |
447 } | |
448 | |
206 | 449 /* draw the edges of width 'w' of an image of size width, height |
450 this mmx version can only handle w==8 || w==16 */ | |
451 static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w) | |
452 { | |
453 UINT8 *ptr, *last_line; | |
454 int i; | |
455 | |
456 last_line = buf + (height - 1) * wrap; | |
457 /* left and right */ | |
458 ptr = buf; | |
459 if(w==8) | |
460 { | |
461 asm volatile( | |
462 "1: \n\t" | |
463 "movd (%0), %%mm0 \n\t" | |
464 "punpcklbw %%mm0, %%mm0 \n\t" | |
465 "punpcklwd %%mm0, %%mm0 \n\t" | |
466 "punpckldq %%mm0, %%mm0 \n\t" | |
467 "movq %%mm0, -8(%0) \n\t" | |
468 "movq -8(%0, %2), %%mm1 \n\t" | |
469 "punpckhbw %%mm1, %%mm1 \n\t" | |
470 "punpckhwd %%mm1, %%mm1 \n\t" | |
471 "punpckhdq %%mm1, %%mm1 \n\t" | |
472 "movq %%mm1, (%0, %2) \n\t" | |
473 "addl %1, %0 \n\t" | |
474 "cmpl %3, %0 \n\t" | |
475 " jb 1b \n\t" | |
476 : "+r" (ptr) | |
477 : "r" (wrap), "r" (width), "r" (ptr + wrap*height) | |
478 ); | |
479 } | |
480 else | |
481 { | |
482 asm volatile( | |
483 "1: \n\t" | |
484 "movd (%0), %%mm0 \n\t" | |
485 "punpcklbw %%mm0, %%mm0 \n\t" | |
486 "punpcklwd %%mm0, %%mm0 \n\t" | |
487 "punpckldq %%mm0, %%mm0 \n\t" | |
488 "movq %%mm0, -8(%0) \n\t" | |
489 "movq %%mm0, -16(%0) \n\t" | |
490 "movq -8(%0, %2), %%mm1 \n\t" | |
491 "punpckhbw %%mm1, %%mm1 \n\t" | |
492 "punpckhwd %%mm1, %%mm1 \n\t" | |
493 "punpckhdq %%mm1, %%mm1 \n\t" | |
494 "movq %%mm1, (%0, %2) \n\t" | |
495 "movq %%mm1, 8(%0, %2) \n\t" | |
496 "addl %1, %0 \n\t" | |
497 "cmpl %3, %0 \n\t" | |
498 " jb 1b \n\t" | |
499 : "+r" (ptr) | |
500 : "r" (wrap), "r" (width), "r" (ptr + wrap*height) | |
501 ); | |
502 } | |
503 | |
504 for(i=0;i<w;i+=4) { | |
505 /* top and bottom (and hopefully also the corners) */ | |
506 ptr= buf - (i + 1) * wrap - w; | |
507 asm volatile( | |
508 "1: \n\t" | |
509 "movq (%1, %0), %%mm0 \n\t" | |
510 "movq %%mm0, (%0) \n\t" | |
511 "movq %%mm0, (%0, %2) \n\t" | |
512 "movq %%mm0, (%0, %2, 2) \n\t" | |
513 "movq %%mm0, (%0, %3) \n\t" | |
514 "addl $8, %0 \n\t" | |
515 "cmpl %4, %0 \n\t" | |
516 " jb 1b \n\t" | |
517 : "+r" (ptr) | |
518 : "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w) | |
519 ); | |
520 ptr= last_line + (i + 1) * wrap - w; | |
521 asm volatile( | |
522 "1: \n\t" | |
523 "movq (%1, %0), %%mm0 \n\t" | |
524 "movq %%mm0, (%0) \n\t" | |
525 "movq %%mm0, (%0, %2) \n\t" | |
526 "movq %%mm0, (%0, %2, 2) \n\t" | |
527 "movq %%mm0, (%0, %3) \n\t" | |
528 "addl $8, %0 \n\t" | |
529 "cmpl %4, %0 \n\t" | |
530 " jb 1b \n\t" | |
531 : "+r" (ptr) | |
532 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) | |
533 ); | |
534 } | |
535 } | |
536 | |
/* Scratch variable; presumably used by the asm in the included
   mpegvideo_mmx_template.c -- not visible from this file, confirm there. */
static volatile int esp_temp;

/* Exists only to reference esp_temp from C code, so that the compiler does
   not warn about an unused static variable. */
void unused_var_warning_killer()
{
    esp_temp = esp_temp + 1;
}
542 | |
543 #undef HAVE_MMX2 | |
544 #define RENAME(a) a ## _MMX | |
545 #include "mpegvideo_mmx_template.c" | |
546 | |
547 #define HAVE_MMX2 | |
548 #undef RENAME | |
549 #define RENAME(a) a ## _MMX2 | |
550 #include "mpegvideo_mmx_template.c" | |
206 | 551 |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
552 void MPV_common_init_mmx(MpegEncContext *s) |
8 | 553 { |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
554 if (mm_flags & MM_MMX) { |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
555 const int dct_algo= s->avctx->dct_algo; |
325 | 556 s->dct_unquantize_h263 = dct_unquantize_h263_mmx; |
557 s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; | |
558 s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; | |
312 | 559 |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
560 draw_edges = draw_edges_mmx; |
220 | 561 |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
562 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
563 s->fdct = fdct_mmx; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
564 |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
565 if(mm_flags & MM_MMXEXT){ |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
566 s->dct_quantize= dct_quantize_MMX2; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
567 } else { |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
568 s->dct_quantize= dct_quantize_MMX; |
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
620
diff
changeset
|
569 } |
350
6ebbecc10063
- Advanced Intra Coding (AIC) support for H.263+ encoder, just DC by now.
pulento
parents:
344
diff
changeset
|
570 } |
14
8ceb13af9cb6
renamed - use of s->dct_unquantize function pointer - SHOULD add faster h263 mmx specific unquantization stuff
glantau
parents:
8
diff
changeset
|
571 } |
8 | 572 } |