Mercurial > mplayer.hg
annotate libmpcodecs/native/rtjpegn.c @ 28894:b29169fccda9
Fix and restructure fastmemcpybench. It is now one binary that runs all
available memcpy variants and prints benchmark results about them.
author | diego |
---|---|
date | Tue, 10 Mar 2009 10:05:09 +0000 |
parents | e3f3a991ba81 |
children | 0f1b5b68af32 |
rev | line source |
---|---|
3802 | 1 /* |
2 RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za) | |
3 | |
4 With modifications by: | |
5 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de> | |
6 and | |
7 (c) 1999 by Wim Taymans <wim.taymans@tvd.be> | |
8 | |
9 This program is free software; you can redistribute it and/or modify | |
10 it under the terms of the GNU General Public License as published by | |
11 the Free Software Foundation; either version 2 of the License, or | |
12 (at your option) any later version. | |
13 | |
14 This program is distributed in the hope that it will be useful, | |
15 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 GNU General Public License for more details. | |
18 | |
19 You should have received a copy of the GNU General Public License | |
20 along with this program; if not, write to the Free Software | |
21977
cea0eb833758
Fix FSF address and otherwise broken license headers.
diego
parents:
21507
diff
changeset
|
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
3802 | 22 */ |
23 | |
24 #include <stdio.h> | |
25 #include <stdlib.h> | |
26 #include <string.h> | |
3805 | 27 |
28 #include "config.h" | |
29 | |
21507
fa99b3d31d13
Hack around libavutil/bswap.h compilation problems due to always_inline undefined.
reimar
parents:
21372
diff
changeset
|
30 #include "mpbswap.h" |
26304
5f526e8e3988
Rename RTJPEG files so that filenames consist of lowercase name only.
diego
parents:
26280
diff
changeset
|
31 #include "rtjpegn.h" |
3802 | 32 |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
33 #if HAVE_MMX |
3802 | 34 #include "mmx.h" |
35 #endif | |
36 | |
37 //#define SHOWBLOCK 1 | |
38 #define BETTERCOMPRESSION 1 | |
39 | |
/*
 * Zig-zag scan order for an 8x8 block: entry k is the row-major index
 * (row*8 + column) of the k-th coefficient in scan order.
 * Each source line below is one anti-diagonal of the block; successive
 * diagonals are walked in alternating directions.
 */
40 static const unsigned char RTjpeg_ZZ[64]={ | |
41 0, | |
42 8, 1, | |
43 2, 9, 16, | |
44 24, 17, 10, 3, | |
45 4, 11, 18, 25, 32, | |
46 40, 33, 26, 19, 12, 5, | |
47 6, 13, 20, 27, 34, 41, 48, | |
48 56, 49, 42, 35, 28, 21, 14, 7, | |
49 15, 22, 29, 36, 43, 50, 57, | |
50 58, 51, 44, 37, 30, 23, | |
51 31, 38, 45, 52, 59, | |
52 60, 53, 46, 39, | |
53 47, 54, 61, | |
54 62, 55, | |
55 63 }; | |
56 | |
/*
 * Per-coefficient scale factors, one per entry of an 8x8 block (eight per
 * source line). The values are fixed point with 2^32 standing for 1.0:
 * the first entry, 4294967296, is exactly 2^32.
 * RTjpeg_dct_init() divides each quantiser entry, shifted left by 32 bits,
 * by the matching factor in this table.
 * NOTE(review): going by the name these are the AAN DCT scale factors
 * (5957222912 / 2^32 is about 1.387) -- confirm against the DCT derivation.
 */
57 static const __u64 RTjpeg_aan_tab[64]={ | |
58 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, | |
59 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, | |
60 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, | |
61 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, | |
62 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, | |
63 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, | |
64 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, | |
65 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, | |
66 }; | |
67 | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
68 #if !HAVE_MMX |
3802 | 69 static __s32 RTjpeg_ws[64+31]; |
70 #endif | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
71 static __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32]; |
3802 | 72 |
/*
 * File-scope working state shared by the routines in this file.
 * NOTE(review): none of these are assigned in the part of the file reviewed
 * here; presumably an init routine points the table pointers into
 * RTjpeg_alldata -- confirm.
 */
3835 | 73 static __s16 *block; // rh |
74 static __s16 *RTjpeg_block; | |
/* Quantiser tables: rescaled in place by RTjpeg_dct_init() and, in MMX
 * builds, repacked to 16-bit entries by RTjpeg_quant_init().
 * NOTE(review): "l"/"c" presumably mean luminance/chrominance -- confirm. */
75 static __s32 *RTjpeg_lqt; | |
76 static __s32 *RTjpeg_cqt; | |
/* NOTE(review): presumably the matching inverse (dequantisation) tables;
 * not used in the code visible here. */
77 static __u32 *RTjpeg_liqt; | |
78 static __u32 *RTjpeg_ciqt; | |
79 | |
/* NOTE(review): looks like the values passed as the "bt8" argument of
 * RTjpeg_b2s()/RTjpeg_s2b() for the two plane types -- confirm at callers. */
80 static unsigned char RTjpeg_lb8; | |
81 static unsigned char RTjpeg_cb8; | |
82 static int RTjpeg_width, RTjpeg_height; | |
83 static int RTjpeg_Ywidth, RTjpeg_Cwidth; | |
84 static int RTjpeg_Ysize, RTjpeg_Csize; | |
85 | |
86 static __s16 *RTjpeg_old=NULL; | |
3802 | 87 |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
88 #if HAVE_MMX |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
89 static mmx_t RTjpeg_lmask; |
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
90 static mmx_t RTjpeg_cmask; |
3802 | 91 #else |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
92 static __u16 RTjpeg_lmask; |
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
93 static __u16 RTjpeg_cmask; |
3802 | 94 #endif |
95 | |
/*
 * Base quantisation step sizes for an 8x8 block, row-major, eight entries
 * per source line. The values are the same as the example luminance table
 * in Annex K of the JPEG specification (ITU-T T.81).
 */
96 static const unsigned char RTjpeg_lum_quant_tbl[64] = { | |
97 16, 11, 10, 16, 24, 40, 51, 61, | |
98 12, 12, 14, 19, 26, 58, 60, 55, | |
99 14, 13, 16, 24, 40, 57, 69, 56, | |
100 14, 17, 22, 29, 51, 87, 80, 62, | |
101 18, 22, 37, 56, 68, 109, 103, 77, | |
102 24, 35, 55, 64, 81, 104, 113, 92, | |
103 49, 64, 78, 87, 103, 121, 120, 101, | |
104 72, 92, 95, 98, 112, 100, 103, 99 | |
105 }; | |
106 | |
/*
 * Chrominance counterpart of the table above; the values are the same as
 * the example chrominance table in Annex K of the JPEG specification.
 */
107 static const unsigned char RTjpeg_chrom_quant_tbl[64] = { | |
108 17, 18, 24, 47, 99, 99, 99, 99, | |
109 18, 21, 26, 66, 99, 99, 99, 99, | |
110 24, 26, 56, 99, 99, 99, 99, 99, | |
111 47, 66, 99, 99, 99, 99, 99, 99, | |
112 99, 99, 99, 99, 99, 99, 99, 99, | |
113 99, 99, 99, 99, 99, 99, 99, 99, | |
114 99, 99, 99, 99, 99, 99, 99, 99, | |
115 99, 99, 99, 99, 99, 99, 99, 99 | |
116 }; | |
117 | |
118 #ifdef BETTERCOMPRESSION | |
119 | |
120 /*--------------------------------------------------*/ | |
121 /* better encoding, but needs a lot more cpu time */ | |
122 /* seems to be more effective than old method +lzo */ | |
123 /* with this encoding lzo isn't efficient anymore */ | |
124 /* there is still more potential for better */ | |
125 /* encoding but that would need even more cputime */ | |
126 /* anyway your mileage may vary */ | |
127 /* */ | |
128 /* written by Martin BIELY and Roman HOCHLEITNER */ | |
129 /*--------------------------------------------------*/ | |
130 | |
131 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
132 /* Block to Stream (encoding) */ | |
133 /* */ | |
134 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
135 static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) |
3802 | 136 { |
137 register int ci, co=1; | |
138 register __s16 ZZvalue; | |
139 register unsigned char bitten; | |
140 register unsigned char bitoff; | |
141 | |
142 #ifdef SHOWBLOCK | |
143 | |
144 int ii; | |
145 for (ii=0; ii < 64; ii++) { | |
146 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); | |
147 } | |
148 fprintf(stdout, "\n\n"); | |
149 | |
150 #endif | |
151 | |
152 // first byte allways written | |
12378 | 153 ((__u8*)strm)[0]= |
3802 | 154 (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); |
155 | |
156 | |
157 ci=63; | |
158 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--; | |
159 | |
160 bitten = ((unsigned char)ci) << 2; | |
161 | |
162 if (ci==0) { | |
12378 | 163 ((__u8*)strm)[1]= bitten; |
3802 | 164 co = 2; |
165 return (int)co; | |
166 } | |
167 | |
168 /* bitoff=0 because the high 6bit contain first non zero position */ | |
169 bitoff = 0; | |
170 co = 1; | |
171 | |
172 for(; ci>0; ci--) { | |
173 | |
174 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
175 | |
176 switch(ZZvalue) { | |
177 case 0: | |
178 break; | |
179 case 1: | |
180 bitten |= (0x01<<bitoff); | |
181 break; | |
182 case -1: | |
183 bitten |= (0x03<<bitoff); | |
184 break; | |
185 default: | |
186 bitten |= (0x02<<bitoff); | |
187 goto HERZWEH; | |
188 break; | |
189 } | |
190 | |
191 if( bitoff == 0 ) { | |
12378 | 192 ((__u8*)strm)[co]= bitten; |
3802 | 193 bitten = 0; |
194 bitoff = 8; | |
195 co++; | |
196 } /* "fall through" */ | |
197 bitoff-=2; | |
198 | |
199 } | |
200 | |
201 /* ci must be 0 */ | |
202 if(bitoff != 6) { | |
203 | |
12378 | 204 ((__u8*)strm)[co]= bitten; |
3802 | 205 co++; |
206 | |
207 } | |
208 goto BAUCHWEH; | |
209 | |
210 HERZWEH: | |
211 /* ci cannot be 0 */ | |
212 /* correct bitoff to nibble boundaries */ | |
213 | |
214 switch(bitoff){ | |
215 case 4: | |
216 case 6: | |
217 bitoff = 0; | |
218 break; | |
219 case 2: | |
220 case 0: | |
12378 | 221 ((__u8*)strm)[co]= bitten; |
3802 | 222 bitoff = 4; |
223 co++; | |
224 bitten = 0; // clear half nibble values in bitten | |
225 break; | |
226 default: | |
227 break; | |
228 } | |
229 | |
230 for(; ci>0; ci--) { | |
231 | |
232 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
233 | |
234 if( (ZZvalue > 7) || (ZZvalue < -7) ) { | |
235 bitten |= (0x08<<bitoff); | |
236 goto HIRNWEH; | |
237 } | |
238 | |
239 bitten |= (ZZvalue&0xf)<<bitoff; | |
240 | |
241 if( bitoff == 0 ) { | |
12378 | 242 ((__u8*)strm)[co]= bitten; |
3802 | 243 bitten = 0; |
244 bitoff = 8; | |
245 co++; | |
246 } /* "fall thru" */ | |
247 bitoff-=4; | |
248 } | |
249 | |
250 /* ci must be 0 */ | |
251 if( bitoff == 0 ) { | |
12378 | 252 ((__u8*)strm)[co]= bitten; |
3802 | 253 co++; |
254 } | |
255 goto BAUCHWEH; | |
256 | |
257 HIRNWEH: | |
258 | |
12378 | 259 ((__u8*)strm)[co]= bitten; |
3802 | 260 co++; |
261 | |
262 | |
263 /* bitting is over now we bite */ | |
264 for(; ci>0; ci--) { | |
265 | |
266 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
267 | |
268 if(ZZvalue>0) | |
269 { | |
270 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; | |
271 } | |
272 else | |
273 { | |
274 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; | |
275 } | |
276 | |
277 } | |
278 | |
279 | |
280 BAUCHWEH: | |
281 /* we gotoo much now we are ill */ | |
282 #ifdef SHOWBLOCK | |
283 { | |
284 int i; | |
285 fprintf(stdout, "\nco = '%d'\n", co); | |
286 for (i=0; i < co+2; i++) { | |
287 fprintf(stdout, "%d ", strm[i]); | |
288 } | |
289 fprintf(stdout, "\n\n"); | |
290 } | |
291 #endif | |
292 | |
293 return (int)co; | |
294 } | |
295 | |
296 #else | |
297 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
298 static int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) |
3802 | 299 { |
300 register int ci, co=1, tmp; | |
301 register __s16 ZZvalue; | |
302 | |
303 #ifdef SHOWBLOCK | |
304 | |
305 int ii; | |
306 for (ii=0; ii < 64; ii++) { | |
307 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); | |
308 } | |
309 fprintf(stdout, "\n\n"); | |
310 | |
311 #endif | |
312 | |
313 (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); | |
314 | |
315 for(ci=1; ci<=bt8; ci++) | |
316 { | |
317 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
318 | |
319 if(ZZvalue>0) | |
320 { | |
321 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; | |
322 } | |
323 else | |
324 { | |
325 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; | |
326 } | |
327 } | |
328 | |
329 for(; ci<64; ci++) | |
330 { | |
331 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
332 | |
333 if(ZZvalue>0) | |
334 { | |
335 strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue; | |
336 } | |
337 else if(ZZvalue<0) | |
338 { | |
339 strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue; | |
340 } | |
341 else /* compress zeros */ | |
342 { | |
343 tmp=ci; | |
344 do | |
345 { | |
346 ci++; | |
347 } | |
348 while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0)); | |
349 | |
350 strm[co++]=(__s8)(63+(ci-tmp)); | |
351 ci--; | |
352 } | |
353 } | |
354 return (int)co; | |
355 } | |
356 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
357 static int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) |
3802 | 358 { |
359 int ci=1, co=1, tmp; | |
360 register int i; | |
361 | |
362 i=RTjpeg_ZZ[0]; | |
363 data[i]=((__u8)strm[0])*qtbl[i]; | |
364 | |
365 for(co=1; co<=bt8; co++) | |
366 { | |
367 i=RTjpeg_ZZ[co]; | |
368 data[i]=strm[ci++]*qtbl[i]; | |
369 } | |
370 | |
371 for(; co<64; co++) | |
372 { | |
373 if(strm[ci]>63) | |
374 { | |
375 tmp=co+strm[ci]-63; | |
376 for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0; | |
377 co--; | |
378 } else | |
379 { | |
380 i=RTjpeg_ZZ[co]; | |
381 data[i]=strm[ci]*qtbl[i]; | |
382 } | |
383 ci++; | |
384 } | |
385 return (int)ci; | |
386 } | |
387 #endif | |
388 | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
389 #if HAVE_MMX |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
/*
 * MMX build: repack both quantiser tables in place from 64 32-bit entries
 * to 64 16-bit entries at the start of the same storage, which is the
 * layout RTjpeg_quant() below loads four at a time.
 */
390 static void RTjpeg_quant_init(void) |
3802 | 391 { |
392 int i; | |
393 __s16 *qtbl; | |
394 | |
/* qtbl aliases the table it is narrowing. Entry i is read as 32 bits
 * before the narrower 16-bit slot i is written, and slot i never reaches
 * past 32-bit entry i, so no unread entry is overwritten. */
395 qtbl=(__s16 *)RTjpeg_lqt; | |
396 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i]; | |
397 | |
398 qtbl=(__s16 *)RTjpeg_cqt; | |
399 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i]; | |
400 } | |
401 | |
/* Constants for RTjpeg_quant(): four 16-bit lanes of 1 and four 16-bit
 * lanes of 0x7fff (32767). Interleaved with the quantiser and block words
 * they make pmaddwd compute block*quant + 32767 in each 32-bit lane. */
12928 | 402 static mmx_t RTjpeg_ones={0x0001000100010001LL}; |
403 static mmx_t RTjpeg_half={0x7fff7fff7fff7fffLL}; | |
3802 | 404 |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
/*
 * MMX build: quantise one block of 64 coefficients in place, four per
 * iteration: block[i] = (block[i]*q[i] + 32767) >> 16, with the result
 * packed back to 16 bits with signed saturation.
 *
 * block  64 16-bit coefficients, overwritten with the quantised values.
 * qtbl   quantiser table; although typed __s32 *, it is read here as
 *        packed 16-bit words (four per 64-bit load), i.e. in the layout
 *        left behind by the MMX RTjpeg_quant_init().
 */
405 static void RTjpeg_quant(__s16 *block, __s32 *qtbl) |
3802 | 406 { |
407 int i; | |
408 mmx_t *bl, *ql; | |
409 | |
410 ql=(mmx_t *)qtbl; | |
411 bl=(mmx_t *)block; | |
412 | |
/* mm6 = 1 in every word, mm7 = 32767 in every word; loaded once and kept
 * for the whole loop */
413 movq_m2r(RTjpeg_ones, mm6); | |
414 movq_m2r(RTjpeg_half, mm7); | |
415 | |
/* 16 iterations x 4 coefficients = 64 */
416 for(i=16; i; i--) | |
417 { | |
418 movq_m2r(*(ql++), mm0); /* quant vals (4) */ | |
419 movq_m2r(*bl, mm2); /* block vals (4) */ | |
420 movq_r2r(mm0, mm1); | |
421 movq_r2r(mm2, mm3); | |
422 | |
/* pair each quantiser word with 1 and each block word with 32767 so that
 * the multiply-add below yields q*b + 1*32767 per 32-bit lane */
423 punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */ | |
424 punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */ | |
425 | |
426 punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */ | |
427 punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */ | |
428 | |
429 pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */ | |
430 pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */ | |
431 | |
/* arithmetic shift right by 16 drops the fractional part */
432 psrad_i2r(16, mm0); | |
433 psrad_i2r(16, mm1); | |
434 | |
/* narrow the four 32-bit results back to four 16-bit words */
435 packssdw_r2r(mm1, mm0); | |
436 | |
437 movq_r2m(mm0, *(bl++)); | |
438 | |
439 } | |
440 } | |
441 #else | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
/*
 * Non-MMX build: the scalar RTjpeg_quant() reads the quantiser tables as
 * they are, so there is nothing to prepare here.
 */
static void RTjpeg_quant_init(void)
{
 /* intentionally empty */
}
445 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
446 static void RTjpeg_quant(__s16 *block, __s32 *qtbl) |
3802 | 447 { |
448 int i; | |
449 | |
450 for(i=0; i<64; i++) | |
451 block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16); | |
452 } | |
453 #endif | |
454 | |
455 /* | |
456 * Perform the forward DCT on one block of samples. | |
457 */ | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
458 #if HAVE_MMX |
/*
 * MMX DCT multipliers, the same 16-bit value replicated in all four words.
 * Each is the named constant scaled by 2^14:
 *   0x2D41 / 16384 = 0.7071,  0x187E / 16384 = 0.3827,
 *   0x22A3 / 16384 = 0.5412,  0x539F / 16384 = 1.3066.
 * RTjpeg_dctY() shifts its operand left by 2 before pmulhw (which keeps the
 * high 16 bits of the product), giving a net multiplication by value/2^14.
 */
12928 | 459 static mmx_t RTjpeg_C4 ={0x2D412D412D412D41LL}; |
460 static mmx_t RTjpeg_C6 ={0x187E187E187E187ELL}; | |
461 static mmx_t RTjpeg_C2mC6={0x22A322A322A322A3LL}; | |
462 static mmx_t RTjpeg_C2pC6={0x539F539F539F539FLL}; | |
463 static mmx_t RTjpeg_zero ={0x0000000000000000LL}; | |
3802 | 464 |
465 #else | |
466 | |
/*
 * Scalar DCT multipliers with 8 fractional bits (value = constant * 256,
 * e.g. 181 / 256 = 0.707).
 */
467 #define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */ | |
468 #define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */ | |
469 #define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */ | |
470 #define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */ | |
471 | |
/* DESCALE10 / DESCALE20: drop 8 / 16 fractional bits, adding half of the
 * divisor first so the result is rounded rather than truncated.
 * D_MULTIPLY: plain product, cast to __s32. */
472 #define DESCALE10(x) (__s16)( ((x)+128) >> 8) | |
473 #define DESCALE20(x) (__s16)(((x)+32768) >> 16) | |
474 #define D_MULTIPLY(var,const) ((__s32) ((var) * (const))) | |
475 #endif | |
476 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
477 static void RTjpeg_dct_init(void) |
3802 | 478 { |
479 int i; | |
480 | |
481 for(i=0; i<64; i++) | |
482 { | |
483 RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]); | |
484 RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]); | |
485 } | |
486 } | |
487 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
488 static void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) |
3802 | 489 { |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
490 #if !HAVE_MMX |
3802 | 491 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
492 __s32 tmp10, tmp11, tmp12, tmp13; | |
493 __s32 z1, z2, z3, z4, z5, z11, z13; | |
494 __u8 *idataptr; | |
495 __s16 *odataptr; | |
496 __s32 *wsptr; | |
497 int ctr; | |
498 | |
499 idataptr = idata; | |
500 wsptr = RTjpeg_ws; | |
501 for (ctr = 7; ctr >= 0; ctr--) { | |
502 tmp0 = idataptr[0] + idataptr[7]; | |
503 tmp7 = idataptr[0] - idataptr[7]; | |
504 tmp1 = idataptr[1] + idataptr[6]; | |
505 tmp6 = idataptr[1] - idataptr[6]; | |
506 tmp2 = idataptr[2] + idataptr[5]; | |
507 tmp5 = idataptr[2] - idataptr[5]; | |
508 tmp3 = idataptr[3] + idataptr[4]; | |
509 tmp4 = idataptr[3] - idataptr[4]; | |
510 | |
511 tmp10 = (tmp0 + tmp3); /* phase 2 */ | |
512 tmp13 = tmp0 - tmp3; | |
513 tmp11 = (tmp1 + tmp2); | |
514 tmp12 = tmp1 - tmp2; | |
515 | |
516 wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */ | |
517 wsptr[4] = (tmp10 - tmp11)<<8; | |
518 | |
519 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ | |
520 wsptr[2] = (tmp13<<8) + z1; /* phase 5 */ | |
521 wsptr[6] = (tmp13<<8) - z1; | |
522 | |
523 tmp10 = tmp4 + tmp5; /* phase 2 */ | |
524 tmp11 = tmp5 + tmp6; | |
525 tmp12 = tmp6 + tmp7; | |
526 | |
527 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
528 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
529 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
530 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
531 | |
532 z11 = (tmp7<<8) + z3; /* phase 5 */ | |
533 z13 = (tmp7<<8) - z3; | |
534 | |
535 wsptr[5] = z13 + z2; /* phase 6 */ | |
536 wsptr[3] = z13 - z2; | |
537 wsptr[1] = z11 + z4; | |
538 wsptr[7] = z11 - z4; | |
539 | |
540 idataptr += rskip<<3; /* advance pointer to next row */ | |
541 wsptr += 8; | |
542 } | |
543 | |
544 wsptr = RTjpeg_ws; | |
545 odataptr=odata; | |
546 for (ctr = 7; ctr >= 0; ctr--) { | |
547 tmp0 = wsptr[0] + wsptr[56]; | |
548 tmp7 = wsptr[0] - wsptr[56]; | |
549 tmp1 = wsptr[8] + wsptr[48]; | |
550 tmp6 = wsptr[8] - wsptr[48]; | |
551 tmp2 = wsptr[16] + wsptr[40]; | |
552 tmp5 = wsptr[16] - wsptr[40]; | |
553 tmp3 = wsptr[24] + wsptr[32]; | |
554 tmp4 = wsptr[24] - wsptr[32]; | |
555 | |
556 tmp10 = tmp0 + tmp3; /* phase 2 */ | |
557 tmp13 = tmp0 - tmp3; | |
558 tmp11 = tmp1 + tmp2; | |
559 tmp12 = tmp1 - tmp2; | |
560 | |
561 odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */ | |
562 odataptr[32] = DESCALE10(tmp10 - tmp11); | |
563 | |
564 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ | |
565 odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */ | |
566 odataptr[48] = DESCALE20((tmp13<<8) - z1); | |
567 | |
568 tmp10 = tmp4 + tmp5; /* phase 2 */ | |
569 tmp11 = tmp5 + tmp6; | |
570 tmp12 = tmp6 + tmp7; | |
571 | |
572 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
573 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
574 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
575 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
576 | |
577 z11 = (tmp7<<8) + z3; /* phase 5 */ | |
578 z13 = (tmp7<<8) - z3; | |
579 | |
580 odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */ | |
581 odataptr[24] = DESCALE20(z13 - z2); | |
582 odataptr[8] = DESCALE20(z11 + z4); | |
583 odataptr[56] = DESCALE20(z11 - z4); | |
584 | |
585 odataptr++; /* advance pointer to next column */ | |
586 wsptr++; | |
587 } | |
588 #else | |
589 volatile mmx_t tmp6, tmp7; | |
590 register mmx_t *dataptr = (mmx_t *)odata; | |
591 mmx_t *idata2 = (mmx_t *)idata; | |
592 | |
593 // first copy the input 8 bit to the destination 16 bits | |
594 | |
595 movq_m2r(RTjpeg_zero, mm2); | |
596 | |
597 | |
598 movq_m2r(*idata2, mm0); | |
599 movq_r2r(mm0, mm1); | |
600 | |
601 punpcklbw_r2r(mm2, mm0); | |
602 movq_r2m(mm0, *(dataptr)); | |
603 | |
604 punpckhbw_r2r(mm2, mm1); | |
605 movq_r2m(mm1, *(dataptr+1)); | |
606 | |
607 idata2 += rskip; | |
608 | |
609 movq_m2r(*idata2, mm0); | |
610 movq_r2r(mm0, mm1); | |
611 | |
612 punpcklbw_r2r(mm2, mm0); | |
613 movq_r2m(mm0, *(dataptr+2)); | |
614 | |
615 punpckhbw_r2r(mm2, mm1); | |
616 movq_r2m(mm1, *(dataptr+3)); | |
617 | |
618 idata2 += rskip; | |
619 | |
620 movq_m2r(*idata2, mm0); | |
621 movq_r2r(mm0, mm1); | |
622 | |
623 punpcklbw_r2r(mm2, mm0); | |
624 movq_r2m(mm0, *(dataptr+4)); | |
625 | |
626 punpckhbw_r2r(mm2, mm1); | |
627 movq_r2m(mm1, *(dataptr+5)); | |
628 | |
629 idata2 += rskip; | |
630 | |
631 movq_m2r(*idata2, mm0); | |
632 movq_r2r(mm0, mm1); | |
633 | |
634 punpcklbw_r2r(mm2, mm0); | |
635 movq_r2m(mm0, *(dataptr+6)); | |
636 | |
637 punpckhbw_r2r(mm2, mm1); | |
638 movq_r2m(mm1, *(dataptr+7)); | |
639 | |
640 idata2 += rskip; | |
641 | |
642 movq_m2r(*idata2, mm0); | |
643 movq_r2r(mm0, mm1); | |
644 | |
645 punpcklbw_r2r(mm2, mm0); | |
646 movq_r2m(mm0, *(dataptr+8)); | |
647 | |
648 punpckhbw_r2r(mm2, mm1); | |
649 movq_r2m(mm1, *(dataptr+9)); | |
650 | |
651 idata2 += rskip; | |
652 | |
653 movq_m2r(*idata2, mm0); | |
654 movq_r2r(mm0, mm1); | |
655 | |
656 punpcklbw_r2r(mm2, mm0); | |
657 movq_r2m(mm0, *(dataptr+10)); | |
658 | |
659 punpckhbw_r2r(mm2, mm1); | |
660 movq_r2m(mm1, *(dataptr+11)); | |
661 | |
662 idata2 += rskip; | |
663 | |
664 movq_m2r(*idata2, mm0); | |
665 movq_r2r(mm0, mm1); | |
666 | |
667 punpcklbw_r2r(mm2, mm0); | |
668 movq_r2m(mm0, *(dataptr+12)); | |
669 | |
670 punpckhbw_r2r(mm2, mm1); | |
671 movq_r2m(mm1, *(dataptr+13)); | |
672 | |
673 idata2 += rskip; | |
674 | |
675 movq_m2r(*idata2, mm0); | |
676 movq_r2r(mm0, mm1); | |
677 | |
678 punpcklbw_r2r(mm2, mm0); | |
679 movq_r2m(mm0, *(dataptr+14)); | |
680 | |
681 punpckhbw_r2r(mm2, mm1); | |
682 movq_r2m(mm1, *(dataptr+15)); | |
683 | |
684 /* Start Transpose to do calculations on rows */ | |
685 | |
686 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5 | |
687 | |
688 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 | |
689 movq_r2r(mm7, mm5); | |
690 | |
691 punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines | |
692 movq_r2r(mm6, mm2); | |
693 | |
694 punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines | |
695 movq_r2r(mm7, mm1); | |
696 | |
697 movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line | |
698 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 | |
699 | |
700 movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line | |
701 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
702 | |
703 movq_r2m(mm7,*(dataptr+9)); // write result 1 | |
704 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines | |
705 | |
706 movq_r2m(mm1,*(dataptr+11)); // write result 2 | |
707 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines | |
708 | |
709 movq_r2r(mm5, mm1); | |
710 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 | |
711 | |
712 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 | |
713 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 | |
714 | |
715 movq_r2m(mm5,*(dataptr+13)); // write result 3 | |
716 | |
717 // last 4x4 done | |
718 | |
719 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 | |
720 | |
721 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line | |
722 movq_r2r(mm0, mm6); | |
723 | |
724 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
725 movq_r2r(mm2, mm7); | |
726 | |
727 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines | |
728 movq_r2r(mm0, mm4); | |
729 | |
730 // | |
731 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line | |
732 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result | |
733 | |
734 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line | |
735 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result | |
736 | |
737 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines | |
738 movq_r2r(mm1, mm2); // copy first line | |
739 | |
740 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines | |
741 movq_r2r(mm6, mm5); // copy first intermediate result | |
742 | |
743 movq_r2m(mm0, *(dataptr+8)); // write result 1 | |
744 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result | |
745 | |
746 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines | |
747 movq_r2r(mm3, mm0); // copy third line | |
748 | |
749 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines | |
750 | |
751 movq_r2m(mm4, *(dataptr+10)); // write result 2 out | |
752 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result | |
753 | |
754 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines | |
755 movq_r2r(mm1, mm4); | |
756 | |
757 movq_r2m(mm6, *(dataptr+12)); // write result 3 out | |
758 punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result | |
759 | |
760 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines | |
761 movq_r2r(mm2, mm6); | |
762 | |
763 movq_r2m(mm5, *(dataptr+14)); // write result 4 out | |
764 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result | |
765 | |
766 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) | |
767 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result | |
768 | |
769 movq_r2m(mm4, *(dataptr+3)); // write result 6 out | |
770 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result | |
771 | |
772 movq_r2m(mm2, *(dataptr+5)); // write result 7 out | |
773 | |
774 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 | |
775 | |
776 movq_r2m(mm6, *(dataptr+7)); // write result 8 out | |
777 | |
778 | |
779 // Do first 4x4 quadrant, which is used in the beginning of the DCT: | |
780 | |
781 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line | |
782 movq_r2r(mm0, mm2); | |
783 | |
784 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
785 movq_r2r(mm7, mm4); | |
786 | |
787 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines | |
788 movq_r2r(mm0, mm1); | |
789 | |
790 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line | |
791 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 | |
792 | |
793 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line | |
794 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
795 | |
796 movq_r2r(mm0, mm7); // write result 1 | |
797 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines | |
798 | |
799 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ | |
800 movq_r2r(mm1, mm6); // write result 2 | |
801 | |
802 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ | |
803 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines | |
804 | |
805 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ | |
806 movq_r2r(mm2, mm3); // copy first intermediate result | |
807 | |
808 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ | |
809 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 | |
810 | |
811 movq_r2m(mm7, tmp7); | |
812 movq_r2r(mm2, mm5); // write result 3 | |
813 | |
814 movq_r2m(mm6, tmp6); | |
815 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 | |
816 | |
817 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */ | |
818 movq_r2r(mm3, mm4); // write result 4 | |
819 | |
820 /************************************************************************************************ | |
821 End of Transpose | |
822 ************************************************************************************************/ | |
823 | |
824 | |
825 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ | |
826 movq_r2r(mm0, mm7); | |
827 | |
828 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ | |
829 movq_r2r(mm1, mm6); | |
830 | |
831 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ | |
832 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ | |
833 | |
834 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ | |
835 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ | |
836 | |
837 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ | |
838 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
839 | |
840 /* stage 3 */ | |
841 | |
842 movq_m2r(tmp6, mm2); | |
843 movq_r2r(mm0, mm3); | |
844 | |
845 psllw_i2r(2, mm6); // m8 * 2^2 | |
846 paddw_r2r(mm1, mm0); | |
847 | |
848 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
849 psubw_r2r(mm1, mm3); | |
850 | |
851 movq_r2m(mm0, *dataptr); | |
852 movq_r2r(mm7, mm0); | |
853 | |
854 /* Odd part */ | |
855 movq_r2m(mm3, *(dataptr+8)); | |
856 paddw_r2r(mm5, mm4); // tmp10 | |
857 | |
858 movq_m2r(tmp7, mm3); | |
859 paddw_r2r(mm6, mm0); // tmp32 | |
860 | |
861 paddw_r2r(mm2, mm5); // tmp11 | |
862 psubw_r2r(mm6, mm7); // tmp33 | |
863 | |
864 movq_r2m(mm0, *(dataptr+4)); | |
865 paddw_r2r(mm3, mm2); // tmp12 | |
866 | |
867 /* stage 4 */ | |
868 | |
869 movq_r2m(mm7, *(dataptr+12)); | |
870 movq_r2r(mm4, mm1); // copy of tmp10 | |
871 | |
872 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
873 psllw_i2r(2, mm4); // m8 * 2^2 | |
874 | |
875 movq_m2r(RTjpeg_C2mC6, mm0); | |
876 psllw_i2r(2, mm1); | |
877 | |
878 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
879 psllw_i2r(2, mm2); | |
880 | |
881 pmulhw_r2r(mm0, mm4); // z5 | |
882 | |
883 /* stage 5 */ | |
884 | |
885 pmulhw_m2r(RTjpeg_C2pC6, mm2); | |
886 psllw_i2r(2, mm5); | |
887 | |
888 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
889 movq_r2r(mm3, mm0); // copy tmp7 | |
890 | |
891 movq_m2r(*(dataptr+1), mm7); | |
892 paddw_r2r(mm1, mm4); // z2 | |
893 | |
894 paddw_r2r(mm1, mm2); // z4 | |
895 | |
896 paddw_r2r(mm5, mm0); // z11 | |
897 psubw_r2r(mm5, mm3); // z13 | |
898 | |
899 /* stage 6 */ | |
900 | |
901 movq_r2r(mm3, mm5); // copy z13 | |
902 psubw_r2r(mm4, mm3); // y3=z13 - z2 | |
903 | |
904 paddw_r2r(mm4, mm5); // y5=z13 + z2 | |
905 movq_r2r(mm0, mm6); // copy z11 | |
906 | |
907 movq_r2m(mm3, *(dataptr+6)); //save y3 | |
908 psubw_r2r(mm2, mm0); // y7=z11 - z4 | |
909 | |
910 movq_r2m(mm5, *(dataptr+10)); //save y5 | |
911 paddw_r2r(mm2, mm6); // y1=z11 + z4 | |
912 | |
913 movq_r2m(mm0, *(dataptr+14)); //save y7 | |
914 | |
915 /************************************************ | |
916 * End of 1st 4 rows | |
917 ************************************************/ | |
918 | |
919 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ | |
920 movq_r2r(mm7, mm0); // copy x0 | |
921 | |
922 movq_r2m(mm6, *(dataptr+2)); //save y1 | |
923 | |
924 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ | |
925 movq_r2r(mm1, mm6); // copy x1 | |
926 | |
927 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 | |
928 | |
929 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ | |
930 movq_r2r(mm2, mm5); // copy x2 | |
931 | |
932 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 | |
933 movq_r2r(mm3, mm4); // copy x3 | |
934 | |
935 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 | |
936 | |
937 movq_r2m(mm7, tmp7); // save tmp07 | |
938 movq_r2r(mm0, mm7); // copy tmp00 | |
939 | |
940 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 | |
941 | |
942 /* stage 2, Even Part */ | |
943 | |
944 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 | |
945 | |
946 movq_r2m(mm6, tmp6); // save tmp07 | |
947 movq_r2r(mm1, mm6); // copy tmp01 | |
948 | |
949 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 | |
950 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 | |
951 | |
952 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 | |
953 | |
954 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 | |
955 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 | |
956 | |
957 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 | |
958 | |
959 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 | |
960 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
961 | |
962 /* stage 3, Even and stage 4 & 5 even */ | |
963 | |
964 movq_m2r(tmp6, mm2); // load tmp6 | |
965 movq_r2r(mm0, mm3); // copy tmp10 | |
966 | |
967 psllw_i2r(2, mm6); // shift z1 | |
968 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 | |
969 | |
970 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
971 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 | |
972 | |
973 movq_r2m(mm0, *(dataptr+1)); //save y0 | |
974 movq_r2r(mm7, mm0); // copy tmp13 | |
975 | |
976 /* odd part */ | |
977 | |
978 movq_r2m(mm3, *(dataptr+9)); //save y4 | |
979 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 | |
980 | |
981 movq_m2r(tmp7, mm3); // load tmp7 | |
982 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 | |
983 | |
984 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 | |
985 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 | |
986 | |
987 movq_r2m(mm0, *(dataptr+5)); //save y2 | |
988 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 | |
989 | |
990 /* stage 4 */ | |
991 | |
992 movq_r2m(mm7, *(dataptr+13)); //save y6 | |
993 movq_r2r(mm4, mm1); // copy tmp10 | |
994 | |
995 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
996 psllw_i2r(2, mm4); // shift tmp10 | |
997 | |
998 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 | |
999 psllw_i2r(2, mm1); // shift (tmp10-tmp12) | |
1000 | |
1001 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1002 psllw_i2r(2, mm5); // prepare for multiply | |
1003 | |
1004 pmulhw_r2r(mm0, mm4); // multiply by converted real | |
1005 | |
1006 /* stage 5 */ | |
1007 | |
1008 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1009 psllw_i2r(2, mm2); // prepare for multiply | |
1010 | |
1011 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply | |
1012 movq_r2r(mm3, mm0); // copy tmp7 | |
1013 | |
1014 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 | |
1015 paddw_r2r(mm1, mm4); // z2 | |
1016 | |
1017 paddw_r2r(mm5, mm0); // z11 | |
1018 psubw_r2r(mm5, mm3); // z13 | |
1019 | |
1020 /* stage 6 */ | |
1021 | |
1022 movq_r2r(mm3, mm5); // copy z13 | |
1023 paddw_r2r(mm1, mm2); // z4 | |
1024 | |
1025 movq_r2r(mm0, mm6); // copy z11 | |
1026 psubw_r2r(mm4, mm5); // y3 | |
1027 | |
1028 paddw_r2r(mm2, mm6); // y1 | |
1029 paddw_r2r(mm4, mm3); // y5 | |
1030 | |
1031 movq_r2m(mm5, *(dataptr+7)); //save y3 | |
1032 | |
1033 movq_r2m(mm6, *(dataptr+3)); //save y1 | |
1034 psubw_r2r(mm2, mm0); // y7 | |
1035 | |
1036 /************************************************************************************************ | |
1037 Start of Transpose | |
1038 ************************************************************************************************/ | |
1039 | |
1040 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 | |
1041 movq_r2r(mm7, mm5); // copy first line | |
1042 | |
1043 punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines | |
1044 movq_r2r(mm6, mm2); // copy third line | |
1045 | |
1046 punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines | |
1047 movq_r2r(mm7, mm1); // copy first intermediate result | |
1048 | |
1049 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 | |
1050 | |
1051 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
1052 | |
1053 movq_r2m(mm7, *(dataptr+9)); // write result 1 | |
1054 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines | |
1055 | |
1056 movq_r2m(mm1, *(dataptr+11)); // write result 2 | |
1057 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines | |
1058 | |
1059 movq_r2r(mm5, mm1); // copy first intermediate result | |
1060 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 | |
1061 | |
1062 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 | |
1063 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 | |
1064 | |
1065 movq_r2m(mm5, *(dataptr+13)); // write result 3 | |
1066 | |
1067 /****** last 4x4 done */ | |
1068 | |
1069 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 | |
1070 | |
1071 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line | |
1072 movq_r2r(mm0, mm6); // copy first line | |
1073 | |
1074 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
1075 movq_r2r(mm2, mm7); // copy third line | |
1076 | |
1077 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines | |
1078 movq_r2r(mm0, mm4); // copy first intermediate result | |
1079 | |
1080 | |
1081 | |
1082 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line | |
1083 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result | |
1084 | |
1085 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line | |
1086 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result | |
1087 | |
1088 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines | |
1089 movq_r2r(mm1, mm2); // copy first line | |
1090 | |
1091 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines | |
1092 movq_r2r(mm6, mm5); // copy first intermediate result | |
1093 | |
1094 movq_r2m(mm0, *(dataptr+8)); // write result 1 | |
1095 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result | |
1096 | |
1097 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines | |
1098 movq_r2r(mm3, mm0); // copy third line | |
1099 | |
1100 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines | |
1101 | |
1102 movq_r2m(mm4, *(dataptr+10)); // write result 2 out | |
1103 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result | |
1104 | |
1105 punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines | |
1106 movq_r2r(mm1, mm4); // copy second intermediate result | |
1107 | |
1108 movq_r2m(mm6, *(dataptr+12)); // write result 3 out | |
1109 punpckldq_r2r(mm3, mm1); // | |
1110 | |
1111 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines | |
1112 movq_r2r(mm2, mm6); // copy second intermediate result | |
1113 | |
1114 movq_r2m(mm5, *(dataptr+14)); // write result 4 out | |
1115 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result | |
1116 | |
1117 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) | |
1118 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result | |
1119 | |
1120 movq_r2m(mm4, *(dataptr+3)); // write result 6 out | |
1121 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result | |
1122 | |
1123 movq_r2m(mm2, *(dataptr+5)); // write result 7 out | |
1124 | |
1125 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 | |
1126 | |
1127 movq_r2m(mm6, *(dataptr+7)); // write result 8 out | |
1128 | |
1129 // Do first 4x4 quadrant, which is used in the beginning of the DCT: | |
1130 | |
1131 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line | |
1132 movq_r2r(mm0, mm2); // copy first line | |
1133 | |
1134 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
1135 movq_r2r(mm7, mm4); // copy third line | |
1136 | |
1137 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines | |
1138 movq_r2r(mm0, mm1); // copy first intermediate result | |
1139 | |
1140 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line | |
1141 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 | |
1142 | |
1143 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line | |
1144 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
1145 | |
1146 movq_r2r(mm0, mm7); // write result 1 | |
1147 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines | |
1148 | |
1149 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ | |
1150 movq_r2r(mm1, mm6); // write result 2 | |
1151 | |
1152 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ | |
1153 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines | |
1154 | |
1155 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ | |
1156 movq_r2r(mm2, mm3); // copy first intermediate result | |
1157 | |
1158 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ | |
1159 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 | |
1160 | |
1161 movq_r2m(mm7, tmp7); // save tmp07 | |
1162 movq_r2r(mm2, mm5); // write result 3 | |
1163 | |
1164 movq_r2m(mm6, tmp6); // save tmp06 | |
1165 | |
1166 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 | |
1167 | |
1168 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */ | |
1169 movq_r2r(mm3, mm4); // write result 4 | |
1170 | |
1171 /************************************************************************************************ | |
1172 End of Transpose 2 | |
1173 ************************************************************************************************/ | |
1174 | |
1175 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ | |
1176 movq_r2r(mm0, mm7); | |
1177 | |
1178 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ | |
1179 movq_r2r(mm1, mm6); | |
1180 | |
1181 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ | |
1182 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ | |
1183 | |
1184 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ | |
1185 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ | |
1186 | |
1187 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ | |
1188 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
1189 | |
1190 /* stage 3 */ | |
1191 | |
1192 movq_m2r(tmp6, mm2); | |
1193 movq_r2r(mm0, mm3); | |
1194 | |
1195 psllw_i2r(2, mm6); // m8 * 2^2 | |
1196 paddw_r2r(mm1, mm0); | |
1197 | |
1198 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
1199 psubw_r2r(mm1, mm3); | |
1200 | |
1201 movq_r2m(mm0, *dataptr); | |
1202 movq_r2r(mm7, mm0); | |
1203 | |
1204 /* Odd part */ | |
1205 movq_r2m(mm3, *(dataptr+8)); | |
1206 paddw_r2r(mm5, mm4); // tmp10 | |
1207 | |
1208 movq_m2r(tmp7, mm3); | |
1209 paddw_r2r(mm6, mm0); // tmp32 | |
1210 | |
1211 paddw_r2r(mm2, mm5); // tmp11 | |
1212 psubw_r2r(mm6, mm7); // tmp33 | |
1213 | |
1214 movq_r2m(mm0, *(dataptr+4)); | |
1215 paddw_r2r(mm3, mm2); // tmp12 | |
1216 | |
1217 /* stage 4 */ | |
1218 movq_r2m(mm7, *(dataptr+12)); | |
1219 movq_r2r(mm4, mm1); // copy of tmp10 | |
1220 | |
1221 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
1222 psllw_i2r(2, mm4); // m8 * 2^2 | |
1223 | |
1224 movq_m2r(RTjpeg_C2mC6, mm0); | |
1225 psllw_i2r(2, mm1); | |
1226 | |
1227 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1228 psllw_i2r(2, mm2); | |
1229 | |
1230 pmulhw_r2r(mm0, mm4); // z5 | |
1231 | |
1232 /* stage 5 */ | |
1233 | |
1234 pmulhw_m2r(RTjpeg_C2pC6, mm2); | |
1235 psllw_i2r(2, mm5); | |
1236 | |
1237 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1238 movq_r2r(mm3, mm0); // copy tmp7 | |
1239 | |
1240 movq_m2r(*(dataptr+1), mm7); | |
1241 paddw_r2r(mm1, mm4); // z2 | |
1242 | |
1243 paddw_r2r(mm1, mm2); // z4 | |
1244 | |
1245 paddw_r2r(mm5, mm0); // z11 | |
1246 psubw_r2r(mm5, mm3); // z13 | |
1247 | |
1248 /* stage 6 */ | |
1249 | |
1250 movq_r2r(mm3, mm5); // copy z13 | |
1251 psubw_r2r(mm4, mm3); // y3=z13 - z2 | |
1252 | |
1253 paddw_r2r(mm4, mm5); // y5=z13 + z2 | |
1254 movq_r2r(mm0, mm6); // copy z11 | |
1255 | |
1256 movq_r2m(mm3, *(dataptr+6)); //save y3 | |
1257 psubw_r2r(mm2, mm0); // y7=z11 - z4 | |
1258 | |
1259 movq_r2m(mm5, *(dataptr+10)); //save y5 | |
1260 paddw_r2r(mm2, mm6); // y1=z11 + z4 | |
1261 | |
1262 movq_r2m(mm0, *(dataptr+14)); //save y7 | |
1263 | |
1264 /************************************************ | |
1265 * End of 1st 4 rows | |
1266 ************************************************/ | |
1267 | |
1268 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ | |
1269 movq_r2r(mm7, mm0); // copy x0 | |
1270 | |
1271 movq_r2m(mm6, *(dataptr+2)); //save y1 | |
1272 | |
1273 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ | |
1274 movq_r2r(mm1, mm6); // copy x1 | |
1275 | |
1276 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 | |
1277 | |
1278 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ | |
1279 movq_r2r(mm2, mm5); // copy x2 | |
1280 | |
1281 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 | |
1282 movq_r2r(mm3, mm4); // copy x3 | |
1283 | |
1284 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 | |
1285 | |
1286 movq_r2m(mm7, tmp7); // save tmp07 | |
1287 movq_r2r(mm0, mm7); // copy tmp00 | |
1288 | |
1289 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 | |
1290 | |
1291 /* stage 2, Even Part */ | |
1292 | |
1293 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 | |
1294 | |
1295 movq_r2m(mm6, tmp6); // save tmp07 | |
1296 movq_r2r(mm1, mm6); // copy tmp01 | |
1297 | |
1298 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 | |
1299 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 | |
1300 | |
1301 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 | |
1302 | |
1303 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 | |
1304 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 | |
1305 | |
1306 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 | |
1307 | |
1308 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 | |
1309 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
1310 | |
1311 /* stage 3, Even and stage 4 & 5 even */ | |
1312 | |
1313 movq_m2r(tmp6, mm2); // load tmp6 | |
1314 movq_r2r(mm0, mm3); // copy tmp10 | |
1315 | |
1316 psllw_i2r(2, mm6); // shift z1 | |
1317 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 | |
1318 | |
1319 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
1320 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 | |
1321 | |
1322 movq_r2m(mm0, *(dataptr+1)); //save y0 | |
1323 movq_r2r(mm7, mm0); // copy tmp13 | |
1324 | |
1325 /* odd part */ | |
1326 | |
1327 movq_r2m(mm3, *(dataptr+9)); //save y4 | |
1328 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 | |
1329 | |
1330 movq_m2r(tmp7, mm3); // load tmp7 | |
1331 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 | |
1332 | |
1333 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 | |
1334 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 | |
1335 | |
1336 movq_r2m(mm0, *(dataptr+5)); //save y2 | |
1337 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 | |
1338 | |
1339 /* stage 4 */ | |
1340 | |
1341 movq_r2m(mm7, *(dataptr+13)); //save y6 | |
1342 movq_r2r(mm4, mm1); // copy tmp10 | |
1343 | |
1344 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
1345 psllw_i2r(2, mm4); // shift tmp10 | |
1346 | |
1347 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 | |
1348 psllw_i2r(2, mm1); // shift (tmp10-tmp12) | |
1349 | |
1350 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1351 psllw_i2r(2, mm5); // prepare for multiply | |
1352 | |
1353 pmulhw_r2r(mm0, mm4); // multiply by converted real | |
1354 | |
1355 /* stage 5 */ | |
1356 | |
1357 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1358 psllw_i2r(2, mm2); // prepare for multiply | |
1359 | |
1360 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply | |
1361 movq_r2r(mm3, mm0); // copy tmp7 | |
1362 | |
1363 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 | |
1364 paddw_r2r(mm1, mm4); // z2 | |
1365 | |
1366 paddw_r2r(mm5, mm0); // z11 | |
1367 psubw_r2r(mm5, mm3); // z13 | |
1368 | |
1369 /* stage 6 */ | |
1370 | |
1371 movq_r2r(mm3, mm5); // copy z13 | |
1372 paddw_r2r(mm1, mm2); // z4 | |
1373 | |
1374 movq_r2r(mm0, mm6); // copy z11 | |
1375 psubw_r2r(mm4, mm5); // y3 | |
1376 | |
1377 paddw_r2r(mm2, mm6); // y1 | |
1378 paddw_r2r(mm4, mm3); // y5 | |
1379 | |
1380 movq_r2m(mm5, *(dataptr+7)); //save y3 | |
1381 psubw_r2r(mm2, mm0); // yè=z11 - z4 | |
1382 | |
1383 movq_r2m(mm3, *(dataptr+11)); //save y5 | |
1384 | |
1385 movq_r2m(mm6, *(dataptr+3)); //save y1 | |
1386 | |
1387 movq_r2m(mm0, *(dataptr+15)); //save y7 | |
1388 | |
1389 | |
1390 #endif | |
1391 } | |
1392 | |
1393 /* | |
1394 | |
1395 Main Routines | |
1396 | |
1397 This file contains most of the initialisation and control functions | |
1398 | |
1399 (C) Justin Schoeman 1998 | |
1400 | |
1401 */ | |
1402 | |
1403 /* | |
1404 | |
1405 Private function | |
1406 | |
1407 Initialise all the cache-aligned data blocks | |
1408 | |
1409 */ | |
1410 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
1411 static void RTjpeg_init_data(void) |
3802 | 1412 { |
1413 unsigned long dptr; | |
1414 | |
1415 dptr=(unsigned long)&(RTjpeg_alldata[0]); | |
1416 dptr+=32; | |
1417 dptr=dptr>>5; | |
1418 dptr=dptr<<5; /* cache align data */ | |
1419 | |
1420 RTjpeg_block=(__s16 *)dptr; | |
1421 dptr+=sizeof(__s16)*64; | |
1422 RTjpeg_lqt=(__s32 *)dptr; | |
1423 dptr+=sizeof(__s32)*64; | |
1424 RTjpeg_cqt=(__s32 *)dptr; | |
1425 dptr+=sizeof(__s32)*64; | |
1426 RTjpeg_liqt=(__u32 *)dptr; | |
1427 dptr+=sizeof(__u32)*64; | |
1428 RTjpeg_ciqt=(__u32 *)dptr; | |
1429 } | |
1430 | |
1431 /* | |
1432 | |
1433 Private Function | |
1434 | |
1435 Re-set quality factor | |
1436 | |
1437 Input: Q -> quality factor (192=best, 32=worst) | |
1438 (no buffer is taken; the quant values are written to the | |
1439 internal quantisation tables only) | |
1440 */ | |
1441 | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
1442 static void RTjpeg_init_Q(__u8 Q) |
3802 | 1443 { |
1444 int i; | |
1445 __u64 qual; | |
1446 | |
1447 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ | |
1448 | |
1449 for(i=0; i<64; i++) | |
1450 { | |
1451 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); | |
1452 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; | |
1453 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); | |
1454 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; | |
1455 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); | |
1456 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); | |
1457 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; | |
1458 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; | |
1459 } | |
1460 | |
1461 RTjpeg_lb8=0; | |
1462 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); | |
1463 RTjpeg_lb8--; | |
1464 RTjpeg_cb8=0; | |
1465 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); | |
1466 RTjpeg_cb8--; | |
1467 | |
1468 RTjpeg_dct_init(); | |
1469 RTjpeg_quant_init(); | |
1470 } | |
1471 | |
1472 /* | |
1473 | |
1474 External Function | |
1475 | |
1476 Initialise compression. | |
1477 | |
1478 Input: buf -> pointer to 128 ints for quant values store to pass back to | |
1479 init_decompress. | |
1480 width -> width of image | |
1481 height -> height of image | |
1482 Q -> quality factor (192=best, 32=worst) | |
1483 | |
1484 */ | |
1485 | |
1486 void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q) | |
1487 { | |
1488 int i; | |
1489 __u64 qual; | |
1490 | |
1491 RTjpeg_init_data(); | |
1492 | |
1493 RTjpeg_width=width; | |
1494 RTjpeg_height=height; | |
1495 RTjpeg_Ywidth = RTjpeg_width>>3; | |
1496 RTjpeg_Ysize=width * height; | |
1497 RTjpeg_Cwidth = RTjpeg_width>>4; | |
1498 RTjpeg_Csize= (width>>1) * height; | |
1499 | |
1500 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ | |
1501 | |
1502 for(i=0; i<64; i++) | |
1503 { | |
1504 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); | |
1505 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; | |
1506 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); | |
1507 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; | |
1508 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); | |
1509 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); | |
1510 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; | |
1511 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; | |
1512 } | |
1513 | |
1514 RTjpeg_lb8=0; | |
1515 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); | |
1516 RTjpeg_lb8--; | |
1517 RTjpeg_cb8=0; | |
1518 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); | |
1519 RTjpeg_cb8--; | |
1520 | |
1521 RTjpeg_dct_init(); | |
1522 RTjpeg_quant_init(); | |
1523 | |
1524 for(i=0; i<64; i++) | |
14896
9ddae5897422
Make nuv files work on bigendian (but old nuv files created with mencoder
reimar
parents:
14642
diff
changeset
|
1525 buf[i]=le2me_32(RTjpeg_liqt[i]); |
3802 | 1526 for(i=0; i<64; i++) |
14896
9ddae5897422
Make nuv files work on bigendian (but old nuv files created with mencoder
reimar
parents:
14642
diff
changeset
|
1527 buf[64+i]=le2me_32(RTjpeg_ciqt[i]); |
3802 | 1528 } |
1529 | |
1530 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp) | |
1531 { | |
1532 __s8 * sb; | |
1533 register __s8 * bp1 = bp + (RTjpeg_width<<3); | |
1534 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
1535 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); | |
1536 register int i, j, k; | |
1537 | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
1538 #if HAVE_MMX |
3802 | 1539 emms(); |
1540 #endif | |
1541 sb=sp; | |
1542 /* Y */ | |
1543 for(i=RTjpeg_height>>1; i; i-=8) | |
1544 { | |
1545 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) | |
1546 { | |
1547 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); | |
1548 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1549 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1550 | |
1551 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
1552 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1553 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1554 | |
1555 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); | |
1556 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1557 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1558 | |
1559 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
1560 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1561 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1562 | |
1563 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); | |
1564 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
1565 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
1566 | |
1567 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); | |
1568 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
1569 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
1570 | |
1571 } | |
1572 bp+=RTjpeg_width<<4; | |
1573 bp1+=RTjpeg_width<<4; | |
1574 bp2+=RTjpeg_width<<2; | |
1575 bp3+=RTjpeg_width<<2; | |
1576 | |
1577 } | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
1578 #if HAVE_MMX |
3802 | 1579 emms(); |
1580 #endif | |
1581 return (sp-sb); | |
1582 } | |
1583 | |
1584 /* | |
1585 External Function | |
1586 | |
1587 Initialise additional data structures for motion compensation | |
1588 | |
1589 */ | |
1590 | |
1591 void RTjpeg_init_mcompress(void) | |
1592 { | |
1593 unsigned long tmp; | |
1594 | |
1595 if(!RTjpeg_old) | |
1596 { | |
1597 RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32); | |
1598 tmp=(unsigned long)RTjpeg_old; | |
1599 tmp+=32; | |
1600 tmp=tmp>>5; | |
1601 RTjpeg_old=(__s16 *)(tmp<<5); | |
1602 } | |
1603 if (!RTjpeg_old) | |
1604 { | |
1605 fprintf(stderr, "RTjpeg: Could not allocate memory\n"); | |
1606 exit(-1); | |
1607 } | |
14642
38572280e8e7
bzero is deprecated patch by Gianluigi Tiesi <mplayer at netfarm.it>
faust3
parents:
12928
diff
changeset
|
1608 memset(RTjpeg_old, 0, ((4*RTjpeg_width*RTjpeg_height))); |
3802 | 1609 } |
1610 | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
1611 #if HAVE_MMX |
3802 | 1612 |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
1613 static int RTjpeg_bcomp(__s16 *old, mmx_t *mask) |
3802 | 1614 { |
1615 int i; | |
1616 mmx_t *mold=(mmx_t *)old; | |
1617 mmx_t *mblock=(mmx_t *)RTjpeg_block; | |
1618 volatile mmx_t result; | |
12928 | 1619 static mmx_t neg={0xffffffffffffffffULL}; |
3802 | 1620 |
1621 movq_m2r(*mask, mm7); | |
1622 movq_m2r(neg, mm6); | |
1623 pxor_r2r(mm5, mm5); | |
1624 | |
1625 for(i=0; i<8; i++) | |
1626 { | |
1627 movq_m2r(*(mblock++), mm0); | |
1628 movq_m2r(*(mblock++), mm2); | |
1629 movq_m2r(*(mold++), mm1); | |
1630 movq_m2r(*(mold++), mm3); | |
1631 psubsw_r2r(mm1, mm0); | |
1632 psubsw_r2r(mm3, mm2); | |
1633 movq_r2r(mm0, mm1); | |
1634 movq_r2r(mm2, mm3); | |
1635 pcmpgtw_r2r(mm7, mm0); | |
1636 pcmpgtw_r2r(mm7, mm2); | |
1637 pxor_r2r(mm6, mm1); | |
1638 pxor_r2r(mm6, mm3); | |
1639 pcmpgtw_r2r(mm7, mm1); | |
1640 pcmpgtw_r2r(mm7, mm3); | |
1641 por_r2r(mm0, mm5); | |
1642 por_r2r(mm2, mm5); | |
1643 por_r2r(mm1, mm5); | |
1644 por_r2r(mm3, mm5); | |
1645 } | |
1646 movq_r2m(mm5, result); | |
1647 | |
1648 if(result.q) | |
1649 { | |
1650 return 0; | |
1651 } | |
1652 return 1; | |
1653 } | |
1654 | |
1655 #else | |
28849
87b59e8d3c26
Mark everything not used outside the file as "static"
reimar
parents:
28298
diff
changeset
|
1656 static int RTjpeg_bcomp(__s16 *old, __u16 *mask) |
3802 | 1657 { |
1658 int i; | |
1659 | |
1660 for(i=0; i<64; i++) | |
1661 if(abs(old[i]-RTjpeg_block[i])>*mask) | |
1662 { | |
1663 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; | |
1664 return 0; | |
1665 } | |
1666 return 1; | |
1667 } | |
1668 #endif | |
1669 | |
1670 int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) | |
1671 { | |
1672 __s8 * sb; | |
1673 register __s8 * bp1 = bp + (RTjpeg_width<<3); | |
1674 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
1675 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); | |
1676 register int i, j, k; | |
1677 | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
1678 #if HAVE_MMX |
3802 | 1679 emms(); |
16653
27b0d49988b2
Fix 100l bugs that break playback on 64 bit systems (like typedefing __u32
reimar
parents:
14896
diff
changeset
|
1680 RTjpeg_lmask.uq=((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask; |
27b0d49988b2
Fix 100l bugs that break playback on 64 bit systems (like typedefing __u32
reimar
parents:
14896
diff
changeset
|
1681 RTjpeg_cmask.uq=((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask; |
3802 | 1682 #else |
16661
adb581352e63
Stupidity in last patch broke compile without MMX: RTjpeg_lmask is a union
reimar
parents:
16653
diff
changeset
|
1683 RTjpeg_lmask=lmask; |
adb581352e63
Stupidity in last patch broke compile without MMX: RTjpeg_lmask is a union
reimar
parents:
16653
diff
changeset
|
1684 RTjpeg_cmask=cmask; |
3802 | 1685 #endif |
1686 | |
1687 sb=sp; | |
1688 block=RTjpeg_old; | |
1689 /* Y */ | |
1690 for(i=RTjpeg_height>>1; i; i-=8) | |
1691 { | |
1692 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) | |
1693 { | |
1694 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); | |
1695 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1696 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
1697 { | |
1698 *((__u8 *)sp++)=255; | |
1699 } | |
1700 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1701 block+=64; | |
1702 | |
1703 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
1704 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1705 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
1706 { | |
1707 *((__u8 *)sp++)=255; | |
1708 } | |
1709 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1710 block+=64; | |
1711 | |
1712 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); | |
1713 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1714 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
1715 { | |
1716 *((__u8 *)sp++)=255; | |
1717 } | |
1718 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1719 block+=64; | |
1720 | |
1721 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
1722 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
1723 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
1724 { | |
1725 *((__u8 *)sp++)=255; | |
1726 } | |
1727 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
1728 block+=64; | |
1729 | |
1730 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); | |
1731 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
1732 if(RTjpeg_bcomp(block, &RTjpeg_cmask)) | |
1733 { | |
1734 *((__u8 *)sp++)=255; | |
1735 } | |
1736 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
1737 block+=64; | |
1738 | |
1739 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); | |
1740 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
1741 if(RTjpeg_bcomp(block, &RTjpeg_cmask)) | |
1742 { | |
1743 *((__u8 *)sp++)=255; | |
1744 } | |
1745 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
1746 block+=64; | |
1747 } | |
1748 bp+=RTjpeg_width<<4; | |
1749 bp1+=RTjpeg_width<<4; | |
1750 bp2+=RTjpeg_width<<2; | |
1751 bp3+=RTjpeg_width<<2; | |
1752 | |
1753 } | |
28298
a7124a264ea6
Completely get rid of MMX define, use HAVE_MMX define instead.
gpoirier
parents:
28296
diff
changeset
|
1754 #if HAVE_MMX |
3802 | 1755 emms(); |
1756 #endif | |
1757 return (sp-sb); | |
1758 } |