mplayer.hg: annotate libmpcodecs/native/RTjpegN.c @ 11619:179138947307
This patch contains bugfixes for the esd audio output driver that I
uncovered while trying to send sound to a remote esd server over a
wireless (11 Mbps, just enough to handle the sound) link.
First, the sound was full of "ticking" sounds. I found a bug that
prevented the "send the remainder of this block" code from ever being
called, so large chunks of audio were simply being ignored. Fixing
this bug removed the "ticking" from audio streams.
Fixing this bug, however, uncovered another problem - when the socket
buffer was full, doing a blocking write to finish the buffer would take
far too long and would turn video into a chunky mess. I'd imagine this
blocking write would be fine for an audio-only stream, but it turns out
to hold up the video far too much.
The solution in this patch is to write as much data as possible to the
socket and then return as soon as possible, accurately reporting the
number of bytes actually written back to mplayer. I've tested it on
both local and remote esd servers, and it works well.
Patch by Benjamin Osheroff <ben@gimbo.net>
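As an illustration only (this is not the actual ao_esd.c change; the file
descriptor, the O_NONBLOCK setup and the helper name are assumptions), a
minimal sketch of the strategy described above looks like this: write
whatever the socket buffer will accept, stop instead of blocking when it
is full, and hand the accepted byte count back to the caller so the rest
of the block can be resubmitted later.

    #include <errno.h>
    #include <unistd.h>

    /* Write as much of buf as the non-blocking socket accepts and
     * report how many bytes actually went out. */
    static int partial_write(int fd, const unsigned char *buf, int len)
    {
        int done = 0;
        while (done < len) {
            ssize_t r = write(fd, buf + done, len - done);
            if (r > 0) {
                done += r;      /* some bytes went out, keep going */
            } else if (r < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
                break;          /* socket buffer full: return instead of blocking */
            } else if (r < 0 && errno == EINTR) {
                continue;       /* interrupted, just retry */
            } else {
                break;          /* real error: report what was written so far */
            }
        }
        return done;            /* caller learns how many bytes were accepted */
    }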
author | attila |
---|---|
date | Wed, 10 Dec 2003 12:19:13 +0000 |
parents | 7e5d9bf1e56f |
children | 649f596054e0 |
rev | line source |
---|---|
3802 | 1 /* |
2 RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za) | |
3 | |
4 With modifications by: | |
5 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de> | |
6 and | |
7 (c) 1999 by Wim Taymans <wim.taymans@tvd.be> | |
8 | |
9 This program is free software; you can redistribute it and/or modify | |
10 it under the terms of the GNU General Public License as published by | |
11 the Free Software Foundation; either version 2 of the License, or | |
12 (at your option) any later version. | |
13 | |
14 This program is distributed in the hope that it will be useful, | |
15 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 GNU General Public License for more details. | |
18 | |
19 You should have received a copy of the GNU General Public License | |
20 along with this program; if not, write to the Free Software | |
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
22 | |
23 */ | |
24 | |
25 #include <stdio.h> | |
26 #include <stdlib.h> | |
27 #include <string.h> | |
3805 | 28 |
29 #include "config.h" | |
30 #ifdef HAVE_MMX | |
31 #define MMX | |
32 #endif | |
33 | |
3802 | 34 #include "RTjpegN.h" |
35 | |
36 #ifdef MMX | |
37 #include "mmx.h" | |
38 #endif | |
39 | |
40 //#define SHOWBLOCK 1 | |
41 #define BETTERCOMPRESSION 1 | |
42 | |
43 static const unsigned char RTjpeg_ZZ[64]={ | |
44 0, | |
45 8, 1, | |
46 2, 9, 16, | |
47 24, 17, 10, 3, | |
48 4, 11, 18, 25, 32, | |
49 40, 33, 26, 19, 12, 5, | |
50 6, 13, 20, 27, 34, 41, 48, | |
51 56, 49, 42, 35, 28, 21, 14, 7, | |
52 15, 22, 29, 36, 43, 50, 57, | |
53 58, 51, 44, 37, 30, 23, | |
54 31, 38, 45, 52, 59, | |
55 60, 53, 46, 39, | |
56 47, 54, 61, | |
57 62, 55, | |
58 63 }; | |
59 | |
60 static const __u64 RTjpeg_aan_tab[64]={ | |
61 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, | |
62 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, | |
63 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, | |
64 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, | |
65 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, | |
66 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, | |
67 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, | |
68 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, | |
69 }; | |
70 | |
71 #ifndef MMX | |
72 static __s32 RTjpeg_ws[64+31]; | |
73 #endif | |
74 __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32]; | |
75 | |
3835 | 76 static __s16 *block; // rh |
77 static __s16 *RTjpeg_block; | |
78 static __s32 *RTjpeg_lqt; | |
79 static __s32 *RTjpeg_cqt; | |
80 static __u32 *RTjpeg_liqt; | |
81 static __u32 *RTjpeg_ciqt; | |
82 | |
83 static unsigned char RTjpeg_lb8; | |
84 static unsigned char RTjpeg_cb8; | |
85 static int RTjpeg_width, RTjpeg_height; | |
86 static int RTjpeg_Ywidth, RTjpeg_Cwidth; | |
87 static int RTjpeg_Ysize, RTjpeg_Csize; | |
88 | |
89 static __s16 *RTjpeg_old=NULL; | |
3802 | 90 |
91 #ifdef MMX | |
92 mmx_t RTjpeg_lmask; | |
93 mmx_t RTjpeg_cmask; | |
94 #else | |
95 __u16 RTjpeg_lmask; | |
96 __u16 RTjpeg_cmask; | |
97 #endif | |
98 int RTjpeg_mtest=0; | |
99 | |
100 static const unsigned char RTjpeg_lum_quant_tbl[64] = { | |
101 16, 11, 10, 16, 24, 40, 51, 61, | |
102 12, 12, 14, 19, 26, 58, 60, 55, | |
103 14, 13, 16, 24, 40, 57, 69, 56, | |
104 14, 17, 22, 29, 51, 87, 80, 62, | |
105 18, 22, 37, 56, 68, 109, 103, 77, | |
106 24, 35, 55, 64, 81, 104, 113, 92, | |
107 49, 64, 78, 87, 103, 121, 120, 101, | |
108 72, 92, 95, 98, 112, 100, 103, 99 | |
109 }; | |
110 | |
111 static const unsigned char RTjpeg_chrom_quant_tbl[64] = { | |
112 17, 18, 24, 47, 99, 99, 99, 99, | |
113 18, 21, 26, 66, 99, 99, 99, 99, | |
114 24, 26, 56, 99, 99, 99, 99, 99, | |
115 47, 66, 99, 99, 99, 99, 99, 99, | |
116 99, 99, 99, 99, 99, 99, 99, 99, | |
117 99, 99, 99, 99, 99, 99, 99, 99, | |
118 99, 99, 99, 99, 99, 99, 99, 99, | |
119 99, 99, 99, 99, 99, 99, 99, 99 | |
120 }; | |
121 | |
122 #ifdef BETTERCOMPRESSION | |
123 | |
124 /*--------------------------------------------------*/ | |
125 /* better encoding, but needs a lot more cpu time */ | |
126 /* seems to be more effective than old method +lzo */ | |
127 /* with this encoding lzo isn't efficient anymore */ | |
128 /* there is still more potential for better */ | |
129 /* encoding but that would need even more cputime */ | |
130 /* anyway your mileage may vary */ | |
131 /* */ | |
132 /* written by Martin BIELY and Roman HOCHLEITNER */ | |
133 /*--------------------------------------------------*/ | |
134 | |
135 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
136 /* Block to Stream (encoding) */ | |
137 /* */ | |
138 | |
139 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) | |
140 { | |
141 register int ci, co=1; | |
142 register __s16 ZZvalue; | |
143 register unsigned char bitten; | |
144 register unsigned char bitoff; | |
145 | |
146 #ifdef SHOWBLOCK | |
147 | |
148 int ii; | |
149 for (ii=0; ii < 64; ii++) { | |
150 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); | |
151 } | |
152 fprintf(stdout, "\n\n"); | |
153 | |
154 #endif | |
155 | |
156 // *strm++ = 0x10; | |
157 // *strm = 0x00; | |
158 // | |
159 // return 2; | |
160 | |
161 // first byte always written | |
162 (__u8)strm[0]= | |
163 (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); | |
164 | |
165 | |
166 ci=63; | |
167 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--; | |
168 | |
169 bitten = ((unsigned char)ci) << 2; | |
170 | |
171 if (ci==0) { | |
172 (__u8)strm[1]= bitten; | |
173 co = 2; | |
174 return (int)co; | |
175 } | |
176 | |
177 /* bitoff=0 because the high 6bit contain first non zero position */ | |
178 bitoff = 0; | |
179 co = 1; | |
180 | |
181 for(; ci>0; ci--) { | |
182 | |
183 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
184 | |
185 switch(ZZvalue) { | |
186 case 0: | |
187 break; | |
188 case 1: | |
189 bitten |= (0x01<<bitoff); | |
190 break; | |
191 case -1: | |
192 bitten |= (0x03<<bitoff); | |
193 break; | |
194 default: | |
195 bitten |= (0x02<<bitoff); | |
196 goto HERZWEH; | |
197 break; | |
198 } | |
199 | |
200 if( bitoff == 0 ) { | |
201 (__u8)strm[co]= bitten; | |
202 bitten = 0; | |
203 bitoff = 8; | |
204 co++; | |
205 } /* "fall through" */ | |
206 bitoff-=2; | |
207 | |
208 } | |
209 | |
210 /* ci must be 0 */ | |
211 if(bitoff != 6) { | |
212 | |
213 (__u8)strm[co]= bitten; | |
214 co++; | |
215 | |
216 } | |
217 goto BAUCHWEH; | |
218 | |
219 HERZWEH: | |
220 /* ci cannot be 0 */ | |
221 /* correct bitoff to nibble boundaries */ | |
222 | |
223 switch(bitoff){ | |
224 case 4: | |
225 case 6: | |
226 bitoff = 0; | |
227 break; | |
228 case 2: | |
229 case 0: | |
230 (__u8)strm[co]= bitten; | |
231 bitoff = 4; | |
232 co++; | |
233 bitten = 0; // clear half nibble values in bitten | |
234 break; | |
235 default: | |
236 break; | |
237 } | |
238 | |
239 for(; ci>0; ci--) { | |
240 | |
241 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
242 | |
243 if( (ZZvalue > 7) || (ZZvalue < -7) ) { | |
244 bitten |= (0x08<<bitoff); | |
245 goto HIRNWEH; | |
246 } | |
247 | |
248 bitten |= (ZZvalue&0xf)<<bitoff; | |
249 | |
250 if( bitoff == 0 ) { | |
251 (__u8)strm[co]= bitten; | |
252 bitten = 0; | |
253 bitoff = 8; | |
254 co++; | |
255 } /* "fall thru" */ | |
256 bitoff-=4; | |
257 } | |
258 | |
259 /* ci must be 0 */ | |
260 if( bitoff == 0 ) { | |
261 (__u8)strm[co]= bitten; | |
262 co++; | |
263 } | |
264 goto BAUCHWEH; | |
265 | |
266 HIRNWEH: | |
267 | |
268 (__u8)strm[co]= bitten; | |
269 co++; | |
270 | |
271 | |
272 /* bitting is over now we bite */ | |
273 for(; ci>0; ci--) { | |
274 | |
275 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
276 | |
277 if(ZZvalue>0) | |
278 { | |
279 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; | |
280 } | |
281 else | |
282 { | |
283 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; | |
284 } | |
285 | |
286 } | |
287 | |
288 | |
289 BAUCHWEH: | |
290 /* we gotoo much now we are ill */ | |
291 #ifdef SHOWBLOCK | |
292 { | |
293 int i; | |
294 fprintf(stdout, "\nco = '%d'\n", co); | |
295 for (i=0; i < co+2; i++) { | |
296 fprintf(stdout, "%d ", strm[i]); | |
297 } | |
298 fprintf(stdout, "\n\n"); | |
299 } | |
300 #endif | |
301 | |
302 return (int)co; | |
303 } | |
304 | |
305 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
306 /* Stream to Block (decoding) */ | |
307 /* */ | |
308 | |
309 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) | |
310 { | |
311 int ci; | |
312 register int co; | |
313 register int i; | |
314 register unsigned char bitten; | |
315 register unsigned char bitoff; | |
316 | |
317 /* first byte always read */ | |
318 i=RTjpeg_ZZ[0]; | |
319 data[i]=((__u8)strm[0])*qtbl[i]; | |
320 | |
321 /* we start at the behind */ | |
322 | |
323 bitten = ((unsigned char)strm[1]) >> 2; | |
324 co = 63; | |
325 for(; co > bitten; co--) { | |
326 | |
327 data[RTjpeg_ZZ[co]] = 0; | |
328 | |
329 } | |
330 | |
331 if (co==0) { | |
332 ci = 2; | |
333 goto AUTOBAHN; | |
334 } | |
335 | |
336 /* we have to read the last 2 bits of the second byte */ | |
337 ci=1; | |
338 bitoff = 0; | |
339 | |
340 for(; co>0; co--) { | |
341 | |
342 bitten = ((unsigned char)strm[ci]) >> bitoff; | |
343 bitten &= 0x03; | |
344 | |
345 i=RTjpeg_ZZ[co]; | |
346 | |
347 switch( bitten ) { | |
348 case 0x03: | |
349 data[i]= -qtbl[i]; | |
350 break; | |
351 case 0x02: | |
352 goto FUSSWEG; | |
353 break; | |
354 case 0x01: | |
355 data[i]= qtbl[i]; | |
356 break; | |
357 case 0x00: | |
358 data[i]= 0; | |
359 break; | |
360 default: | |
6335 | 361 break; |
3802 | 362 } |
363 | |
364 if( bitoff == 0 ) { | |
365 bitoff = 8; | |
366 ci++; | |
367 } | |
368 bitoff -= 2; | |
369 } | |
370 /* co is 0 now */ | |
371 /* data is written properly */ | |
372 | |
373 /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */ | |
374 if (bitoff!=6) ci++; | |
375 | |
376 goto AUTOBAHN; | |
377 | |
378 | |
379 FUSSWEG: | |
380 /* correct bitoff to nibble */ | |
381 switch(bitoff){ | |
382 case 4: | |
383 case 6: | |
384 bitoff = 0; | |
385 break; | |
386 case 2: | |
387 case 0: | |
388 /* we have to read from the next byte */ | |
389 ci++; | |
390 bitoff = 4; | |
391 break; | |
392 default: | |
393 break; | |
394 } | |
395 | |
396 for(; co>0; co--) { | |
397 | |
398 bitten = ((unsigned char)strm[ci]) >> bitoff; | |
399 bitten &= 0x0f; | |
400 | |
401 i=RTjpeg_ZZ[co]; | |
402 | |
403 if( bitten == 0x08 ) { | |
404 goto STRASSE; | |
405 } | |
406 | |
407 /* the compiler cannot do sign extension for signed nibbles */ | |
408 if( bitten & 0x08 ) { | |
409 bitten |= 0xf0; | |
410 } | |
411 /* the unsigned char bitten now is a valid signed char */ | |
412 | |
413 data[i]=((signed char)bitten)*qtbl[i]; | |
414 | |
415 if( bitoff == 0 ) { | |
416 bitoff = 8; | |
417 ci++; | |
418 } | |
419 bitoff -= 4; | |
420 } | |
421 /* co is 0 */ | |
422 | |
423 /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */ | |
424 if (bitoff!=4) ci++; | |
425 | |
426 goto AUTOBAHN; | |
427 | |
428 STRASSE: | |
429 ci++; | |
430 | |
431 for(; co>0; co--) { | |
432 i=RTjpeg_ZZ[co]; | |
433 data[i]=strm[ci++]*qtbl[i]; | |
434 } | |
435 | |
436 /* ci now is the count, because it points to next element => no incrementing */ | |
437 | |
438 AUTOBAHN: | |
439 | |
440 #ifdef SHOWBLOCK | |
441 fprintf(stdout, "\nci = '%d'\n", ci); | |
442 for (i=0; i < 64; i++) { | |
443 fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]); | |
444 } | |
445 fprintf(stdout, "\n\n"); | |
446 #endif | |
447 | |
448 return ci; | |
449 } | |
450 | |
451 #else | |
452 | |
453 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) | |
454 { | |
455 register int ci, co=1, tmp; | |
456 register __s16 ZZvalue; | |
457 | |
458 #ifdef SHOWBLOCK | |
459 | |
460 int ii; | |
461 for (ii=0; ii < 64; ii++) { | |
462 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]); | |
463 } | |
464 fprintf(stdout, "\n\n"); | |
465 | |
466 #endif | |
467 | |
468 (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); | |
469 | |
470 for(ci=1; ci<=bt8; ci++) | |
471 { | |
472 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
473 | |
474 if(ZZvalue>0) | |
475 { | |
476 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; | |
477 } | |
478 else | |
479 { | |
480 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; | |
481 } | |
482 } | |
483 | |
484 for(; ci<64; ci++) | |
485 { | |
486 ZZvalue = data[RTjpeg_ZZ[ci]]; | |
487 | |
488 if(ZZvalue>0) | |
489 { | |
490 strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue; | |
491 } | |
492 else if(ZZvalue<0) | |
493 { | |
494 strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue; | |
495 } | |
496 else /* compress zeros */ | |
497 { | |
498 tmp=ci; | |
499 do | |
500 { | |
501 ci++; | |
502 } | |
503 while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0)); | |
504 | |
505 strm[co++]=(__s8)(63+(ci-tmp)); | |
506 ci--; | |
507 } | |
508 } | |
509 return (int)co; | |
510 } | |
511 | |
512 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) | |
513 { | |
514 int ci=1, co=1, tmp; | |
515 register int i; | |
516 | |
517 i=RTjpeg_ZZ[0]; | |
518 data[i]=((__u8)strm[0])*qtbl[i]; | |
519 | |
520 for(co=1; co<=bt8; co++) | |
521 { | |
522 i=RTjpeg_ZZ[co]; | |
523 data[i]=strm[ci++]*qtbl[i]; | |
524 } | |
525 | |
526 for(; co<64; co++) | |
527 { | |
528 if(strm[ci]>63) | |
529 { | |
530 tmp=co+strm[ci]-63; | |
531 for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0; | |
532 co--; | |
533 } else | |
534 { | |
535 i=RTjpeg_ZZ[co]; | |
536 data[i]=strm[ci]*qtbl[i]; | |
537 } | |
538 ci++; | |
539 } | |
540 return (int)ci; | |
541 } | |
542 #endif | |
543 | |
544 #if defined(MMX) | |
545 void RTjpeg_quant_init(void) | |
546 { | |
547 int i; | |
548 __s16 *qtbl; | |
549 | |
550 qtbl=(__s16 *)RTjpeg_lqt; | |
551 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i]; | |
552 | |
553 qtbl=(__s16 *)RTjpeg_cqt; | |
554 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i]; | |
555 } | |
556 | |
557 static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL; | |
558 static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL; | |
559 | |
560 void RTjpeg_quant(__s16 *block, __s32 *qtbl) | |
561 { | |
562 int i; | |
563 mmx_t *bl, *ql; | |
564 | |
565 ql=(mmx_t *)qtbl; | |
566 bl=(mmx_t *)block; | |
567 | |
568 movq_m2r(RTjpeg_ones, mm6); | |
569 movq_m2r(RTjpeg_half, mm7); | |
570 | |
571 for(i=16; i; i--) | |
572 { | |
573 movq_m2r(*(ql++), mm0); /* quant vals (4) */ | |
574 movq_m2r(*bl, mm2); /* block vals (4) */ | |
575 movq_r2r(mm0, mm1); | |
576 movq_r2r(mm2, mm3); | |
577 | |
578 punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */ | |
579 punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */ | |
580 | |
581 punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */ | |
582 punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */ | |
583 | |
584 pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */ | |
585 pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */ | |
586 | |
587 psrad_i2r(16, mm0); | |
588 psrad_i2r(16, mm1); | |
589 | |
590 packssdw_r2r(mm1, mm0); | |
591 | |
592 movq_r2m(mm0, *(bl++)); | |
593 | |
594 } | |
595 } | |
596 #else | |
597 void RTjpeg_quant_init(void) | |
598 { | |
599 } | |
600 | |
601 void RTjpeg_quant(__s16 *block, __s32 *qtbl) | |
602 { | |
603 int i; | |
604 | |
605 for(i=0; i<64; i++) | |
606 block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16); | |
607 } | |
608 #endif | |
609 | |
610 /* | |
611 * Perform the forward DCT on one block of samples. | |
612 */ | |
613 #ifdef MMX | |
614 static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL; | |
615 static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL; | |
616 static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL; | |
617 static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL; | |
618 static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL; | |
619 | |
620 #else | |
621 | |
622 #define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */ | |
623 #define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */ | |
624 #define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */ | |
625 #define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */ | |
626 | |
627 #define DESCALE10(x) (__s16)( ((x)+128) >> 8) | |
628 #define DESCALE20(x) (__s16)(((x)+32768) >> 16) | |
629 #define D_MULTIPLY(var,const) ((__s32) ((var) * (const))) | |
630 #endif | |
631 | |
632 void RTjpeg_dct_init(void) | |
633 { | |
634 int i; | |
635 | |
636 for(i=0; i<64; i++) | |
637 { | |
638 RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]); | |
639 RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]); | |
640 } | |
641 } | |
642 | |
643 void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) | |
644 { | |
645 #ifndef MMX | |
646 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
647 __s32 tmp10, tmp11, tmp12, tmp13; | |
648 __s32 z1, z2, z3, z4, z5, z11, z13; | |
649 __u8 *idataptr; | |
650 __s16 *odataptr; | |
651 __s32 *wsptr; | |
652 int ctr; | |
653 | |
654 idataptr = idata; | |
655 wsptr = RTjpeg_ws; | |
656 for (ctr = 7; ctr >= 0; ctr--) { | |
657 tmp0 = idataptr[0] + idataptr[7]; | |
658 tmp7 = idataptr[0] - idataptr[7]; | |
659 tmp1 = idataptr[1] + idataptr[6]; | |
660 tmp6 = idataptr[1] - idataptr[6]; | |
661 tmp2 = idataptr[2] + idataptr[5]; | |
662 tmp5 = idataptr[2] - idataptr[5]; | |
663 tmp3 = idataptr[3] + idataptr[4]; | |
664 tmp4 = idataptr[3] - idataptr[4]; | |
665 | |
666 tmp10 = (tmp0 + tmp3); /* phase 2 */ | |
667 tmp13 = tmp0 - tmp3; | |
668 tmp11 = (tmp1 + tmp2); | |
669 tmp12 = tmp1 - tmp2; | |
670 | |
671 wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */ | |
672 wsptr[4] = (tmp10 - tmp11)<<8; | |
673 | |
674 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ | |
675 wsptr[2] = (tmp13<<8) + z1; /* phase 5 */ | |
676 wsptr[6] = (tmp13<<8) - z1; | |
677 | |
678 tmp10 = tmp4 + tmp5; /* phase 2 */ | |
679 tmp11 = tmp5 + tmp6; | |
680 tmp12 = tmp6 + tmp7; | |
681 | |
682 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
683 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
684 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
685 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
686 | |
687 z11 = (tmp7<<8) + z3; /* phase 5 */ | |
688 z13 = (tmp7<<8) - z3; | |
689 | |
690 wsptr[5] = z13 + z2; /* phase 6 */ | |
691 wsptr[3] = z13 - z2; | |
692 wsptr[1] = z11 + z4; | |
693 wsptr[7] = z11 - z4; | |
694 | |
695 idataptr += rskip<<3; /* advance pointer to next row */ | |
696 wsptr += 8; | |
697 } | |
698 | |
699 wsptr = RTjpeg_ws; | |
700 odataptr=odata; | |
701 for (ctr = 7; ctr >= 0; ctr--) { | |
702 tmp0 = wsptr[0] + wsptr[56]; | |
703 tmp7 = wsptr[0] - wsptr[56]; | |
704 tmp1 = wsptr[8] + wsptr[48]; | |
705 tmp6 = wsptr[8] - wsptr[48]; | |
706 tmp2 = wsptr[16] + wsptr[40]; | |
707 tmp5 = wsptr[16] - wsptr[40]; | |
708 tmp3 = wsptr[24] + wsptr[32]; | |
709 tmp4 = wsptr[24] - wsptr[32]; | |
710 | |
711 tmp10 = tmp0 + tmp3; /* phase 2 */ | |
712 tmp13 = tmp0 - tmp3; | |
713 tmp11 = tmp1 + tmp2; | |
714 tmp12 = tmp1 - tmp2; | |
715 | |
716 odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */ | |
717 odataptr[32] = DESCALE10(tmp10 - tmp11); | |
718 | |
719 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ | |
720 odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */ | |
721 odataptr[48] = DESCALE20((tmp13<<8) - z1); | |
722 | |
723 tmp10 = tmp4 + tmp5; /* phase 2 */ | |
724 tmp11 = tmp5 + tmp6; | |
725 tmp12 = tmp6 + tmp7; | |
726 | |
727 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ | |
728 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ | |
729 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ | |
730 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ | |
731 | |
732 z11 = (tmp7<<8) + z3; /* phase 5 */ | |
733 z13 = (tmp7<<8) - z3; | |
734 | |
735 odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */ | |
736 odataptr[24] = DESCALE20(z13 - z2); | |
737 odataptr[8] = DESCALE20(z11 + z4); | |
738 odataptr[56] = DESCALE20(z11 - z4); | |
739 | |
740 odataptr++; /* advance pointer to next column */ | |
741 wsptr++; | |
742 } | |
743 #else | |
744 volatile mmx_t tmp6, tmp7; | |
745 register mmx_t *dataptr = (mmx_t *)odata; | |
746 mmx_t *idata2 = (mmx_t *)idata; | |
747 | |
748 // first copy the input 8 bit to the destination 16 bits | |
749 | |
750 movq_m2r(RTjpeg_zero, mm2); | |
751 | |
752 | |
753 movq_m2r(*idata2, mm0); | |
754 movq_r2r(mm0, mm1); | |
755 | |
756 punpcklbw_r2r(mm2, mm0); | |
757 movq_r2m(mm0, *(dataptr)); | |
758 | |
759 punpckhbw_r2r(mm2, mm1); | |
760 movq_r2m(mm1, *(dataptr+1)); | |
761 | |
762 idata2 += rskip; | |
763 | |
764 movq_m2r(*idata2, mm0); | |
765 movq_r2r(mm0, mm1); | |
766 | |
767 punpcklbw_r2r(mm2, mm0); | |
768 movq_r2m(mm0, *(dataptr+2)); | |
769 | |
770 punpckhbw_r2r(mm2, mm1); | |
771 movq_r2m(mm1, *(dataptr+3)); | |
772 | |
773 idata2 += rskip; | |
774 | |
775 movq_m2r(*idata2, mm0); | |
776 movq_r2r(mm0, mm1); | |
777 | |
778 punpcklbw_r2r(mm2, mm0); | |
779 movq_r2m(mm0, *(dataptr+4)); | |
780 | |
781 punpckhbw_r2r(mm2, mm1); | |
782 movq_r2m(mm1, *(dataptr+5)); | |
783 | |
784 idata2 += rskip; | |
785 | |
786 movq_m2r(*idata2, mm0); | |
787 movq_r2r(mm0, mm1); | |
788 | |
789 punpcklbw_r2r(mm2, mm0); | |
790 movq_r2m(mm0, *(dataptr+6)); | |
791 | |
792 punpckhbw_r2r(mm2, mm1); | |
793 movq_r2m(mm1, *(dataptr+7)); | |
794 | |
795 idata2 += rskip; | |
796 | |
797 movq_m2r(*idata2, mm0); | |
798 movq_r2r(mm0, mm1); | |
799 | |
800 punpcklbw_r2r(mm2, mm0); | |
801 movq_r2m(mm0, *(dataptr+8)); | |
802 | |
803 punpckhbw_r2r(mm2, mm1); | |
804 movq_r2m(mm1, *(dataptr+9)); | |
805 | |
806 idata2 += rskip; | |
807 | |
808 movq_m2r(*idata2, mm0); | |
809 movq_r2r(mm0, mm1); | |
810 | |
811 punpcklbw_r2r(mm2, mm0); | |
812 movq_r2m(mm0, *(dataptr+10)); | |
813 | |
814 punpckhbw_r2r(mm2, mm1); | |
815 movq_r2m(mm1, *(dataptr+11)); | |
816 | |
817 idata2 += rskip; | |
818 | |
819 movq_m2r(*idata2, mm0); | |
820 movq_r2r(mm0, mm1); | |
821 | |
822 punpcklbw_r2r(mm2, mm0); | |
823 movq_r2m(mm0, *(dataptr+12)); | |
824 | |
825 punpckhbw_r2r(mm2, mm1); | |
826 movq_r2m(mm1, *(dataptr+13)); | |
827 | |
828 idata2 += rskip; | |
829 | |
830 movq_m2r(*idata2, mm0); | |
831 movq_r2r(mm0, mm1); | |
832 | |
833 punpcklbw_r2r(mm2, mm0); | |
834 movq_r2m(mm0, *(dataptr+14)); | |
835 | |
836 punpckhbw_r2r(mm2, mm1); | |
837 movq_r2m(mm1, *(dataptr+15)); | |
838 | |
839 /* Start Transpose to do calculations on rows */ | |
840 | |
841 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5 | |
842 | |
843 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 | |
844 movq_r2r(mm7, mm5); | |
845 | |
846 punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines | |
847 movq_r2r(mm6, mm2); | |
848 | |
849 punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines | |
850 movq_r2r(mm7, mm1); | |
851 | |
852 movq_m2r(*(dataptr+11), mm3); // m13:m12|m11:m10 - second line | |
853 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 | |
854 | |
855 movq_m2r(*(dataptr+15), mm0); // m33:m32|m31:m30 - fourth line | |
856 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
857 | |
858 movq_r2m(mm7,*(dataptr+9)); // write result 1 | |
859 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines | |
860 | |
861 movq_r2m(mm1,*(dataptr+11)); // write result 2 | |
862 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines | |
863 | |
864 movq_r2r(mm5, mm1); | |
865 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 | |
866 | |
867 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 | |
868 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 | |
869 | |
870 movq_r2m(mm5,*(dataptr+13)); // write result 3 | |
871 | |
872 // last 4x4 done | |
873 | |
874 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 | |
875 | |
876 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line | |
877 movq_r2r(mm0, mm6); | |
878 | |
879 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
880 movq_r2r(mm2, mm7); | |
881 | |
882 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines | |
883 movq_r2r(mm0, mm4); | |
884 | |
885 // | |
886 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line | |
887 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result | |
888 | |
889 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line | |
890 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result | |
891 | |
892 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines | |
893 movq_r2r(mm1, mm2); // copy first line | |
894 | |
895 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines | |
896 movq_r2r(mm6, mm5); // copy first intermediate result | |
897 | |
898 movq_r2m(mm0, *(dataptr+8)); // write result 1 | |
899 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result | |
900 | |
901 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines | |
902 movq_r2r(mm3, mm0); // copy third line | |
903 | |
904 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines | |
905 | |
906 movq_r2m(mm4, *(dataptr+10)); // write result 2 out | |
907 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result | |
908 | |
909 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines | |
910 movq_r2r(mm1, mm4); | |
911 | |
912 movq_r2m(mm6, *(dataptr+12)); // write result 3 out | |
913 punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result | |
914 | |
915 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines | |
916 movq_r2r(mm2, mm6); | |
917 | |
918 movq_r2m(mm5, *(dataptr+14)); // write result 4 out | |
919 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result | |
920 | |
921 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) | |
922 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result | |
923 | |
924 movq_r2m(mm4, *(dataptr+3)); // write result 6 out | |
925 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result | |
926 | |
927 movq_r2m(mm2, *(dataptr+5)); // write result 7 out | |
928 | |
929 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 | |
930 | |
931 movq_r2m(mm6, *(dataptr+7)); // write result 8 out | |
932 | |
933 | |
934 // Do first 4x4 quadrant, which is used in the beginning of the DCT: | |
935 | |
936 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line | |
937 movq_r2r(mm0, mm2); | |
938 | |
939 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
940 movq_r2r(mm7, mm4); | |
941 | |
942 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines | |
943 movq_r2r(mm0, mm1); | |
944 | |
945 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line | |
946 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 | |
947 | |
948 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line | |
949 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
950 | |
951 movq_r2r(mm0, mm7); // write result 1 | |
952 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines | |
953 | |
954 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ | |
955 movq_r2r(mm1, mm6); // write result 2 | |
956 | |
957 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ | |
958 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines | |
959 | |
960 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ | |
961 movq_r2r(mm2, mm3); // copy first intermediate result | |
962 | |
963 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ | |
964 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 | |
965 | |
966 movq_r2m(mm7, tmp7); | |
967 movq_r2r(mm2, mm5); // write result 3 | |
968 | |
969 movq_r2m(mm6, tmp6); | |
970 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 | |
971 | |
972 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* Stage 1 */ | |
973 movq_r2r(mm3, mm4); // write result 4 | |
974 | |
975 /************************************************************************************************ | |
976 End of Transpose | |
977 ************************************************************************************************/ | |
978 | |
979 | |
980 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ | |
981 movq_r2r(mm0, mm7); | |
982 | |
983 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ | |
984 movq_r2r(mm1, mm6); | |
985 | |
986 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ | |
987 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ | |
988 | |
989 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ | |
990 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ | |
991 | |
992 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ | |
993 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
994 | |
995 /* stage 3 */ | |
996 | |
997 movq_m2r(tmp6, mm2); | |
998 movq_r2r(mm0, mm3); | |
999 | |
1000 psllw_i2r(2, mm6); // m8 * 2^2 | |
1001 paddw_r2r(mm1, mm0); | |
1002 | |
1003 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
1004 psubw_r2r(mm1, mm3); | |
1005 | |
1006 movq_r2m(mm0, *dataptr); | |
1007 movq_r2r(mm7, mm0); | |
1008 | |
1009 /* Odd part */ | |
1010 movq_r2m(mm3, *(dataptr+8)); | |
1011 paddw_r2r(mm5, mm4); // tmp10 | |
1012 | |
1013 movq_m2r(tmp7, mm3); | |
1014 paddw_r2r(mm6, mm0); // tmp32 | |
1015 | |
1016 paddw_r2r(mm2, mm5); // tmp11 | |
1017 psubw_r2r(mm6, mm7); // tmp33 | |
1018 | |
1019 movq_r2m(mm0, *(dataptr+4)); | |
1020 paddw_r2r(mm3, mm2); // tmp12 | |
1021 | |
1022 /* stage 4 */ | |
1023 | |
1024 movq_r2m(mm7, *(dataptr+12)); | |
1025 movq_r2r(mm4, mm1); // copy of tmp10 | |
1026 | |
1027 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
1028 psllw_i2r(2, mm4); // m8 * 2^2 | |
1029 | |
1030 movq_m2r(RTjpeg_C2mC6, mm0); | |
1031 psllw_i2r(2, mm1); | |
1032 | |
1033 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1034 psllw_i2r(2, mm2); | |
1035 | |
1036 pmulhw_r2r(mm0, mm4); // z5 | |
1037 | |
1038 /* stage 5 */ | |
1039 | |
1040 pmulhw_m2r(RTjpeg_C2pC6, mm2); | |
1041 psllw_i2r(2, mm5); | |
1042 | |
1043 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1044 movq_r2r(mm3, mm0); // copy tmp7 | |
1045 | |
1046 movq_m2r(*(dataptr+1), mm7); | |
1047 paddw_r2r(mm1, mm4); // z2 | |
1048 | |
1049 paddw_r2r(mm1, mm2); // z4 | |
1050 | |
1051 paddw_r2r(mm5, mm0); // z11 | |
1052 psubw_r2r(mm5, mm3); // z13 | |
1053 | |
1054 /* stage 6 */ | |
1055 | |
1056 movq_r2r(mm3, mm5); // copy z13 | |
1057 psubw_r2r(mm4, mm3); // y3=z13 - z2 | |
1058 | |
1059 paddw_r2r(mm4, mm5); // y5=z13 + z2 | |
1060 movq_r2r(mm0, mm6); // copy z11 | |
1061 | |
1062 movq_r2m(mm3, *(dataptr+6)); //save y3 | |
1063 psubw_r2r(mm2, mm0); // y7=z11 - z4 | |
1064 | |
1065 movq_r2m(mm5, *(dataptr+10)); //save y5 | |
1066 paddw_r2r(mm2, mm6); // y1=z11 + z4 | |
1067 | |
1068 movq_r2m(mm0, *(dataptr+14)); //save y7 | |
1069 | |
1070 /************************************************ | |
1071 * End of 1st 4 rows | |
1072 ************************************************/ | |
1073 | |
1074 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ | |
1075 movq_r2r(mm7, mm0); // copy x0 | |
1076 | |
1077 movq_r2m(mm6, *(dataptr+2)); //save y1 | |
1078 | |
1079 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ | |
1080 movq_r2r(mm1, mm6); // copy x1 | |
1081 | |
1082 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 | |
1083 | |
1084 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ | |
1085 movq_r2r(mm2, mm5); // copy x2 | |
1086 | |
1087 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 | |
1088 movq_r2r(mm3, mm4); // copy x3 | |
1089 | |
1090 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 | |
1091 | |
1092 movq_r2m(mm7, tmp7); // save tmp07 | |
1093 movq_r2r(mm0, mm7); // copy tmp00 | |
1094 | |
1095 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 | |
1096 | |
1097 /* stage 2, Even Part */ | |
1098 | |
1099 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 | |
1100 | |
1101 movq_r2m(mm6, tmp6); // save tmp06 | |
1102 movq_r2r(mm1, mm6); // copy tmp01 | |
1103 | |
1104 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 | |
1105 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 | |
1106 | |
1107 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 | |
1108 | |
1109 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 | |
1110 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 | |
1111 | |
1112 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 | |
1113 | |
1114 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 | |
1115 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
1116 | |
1117 /* stage 3, Even and stage 4 & 5 even */ | |
1118 | |
1119 movq_m2r(tmp6, mm2); // load tmp6 | |
1120 movq_r2r(mm0, mm3); // copy tmp10 | |
1121 | |
1122 psllw_i2r(2, mm6); // shift z1 | |
1123 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 | |
1124 | |
1125 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
1126 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 | |
1127 | |
1128 movq_r2m(mm0, *(dataptr+1)); //save y0 | |
1129 movq_r2r(mm7, mm0); // copy tmp13 | |
1130 | |
1131 /* odd part */ | |
1132 | |
1133 movq_r2m(mm3, *(dataptr+9)); //save y4 | |
1134 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 | |
1135 | |
1136 movq_m2r(tmp7, mm3); // load tmp7 | |
1137 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 | |
1138 | |
1139 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 | |
1140 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 | |
1141 | |
1142 movq_r2m(mm0, *(dataptr+5)); //save y2 | |
1143 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 | |
1144 | |
1145 /* stage 4 */ | |
1146 | |
1147 movq_r2m(mm7, *(dataptr+13)); //save y6 | |
1148 movq_r2r(mm4, mm1); // copy tmp10 | |
1149 | |
1150 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
1151 psllw_i2r(2, mm4); // shift tmp10 | |
1152 | |
1153 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 | |
1154 psllw_i2r(2, mm1); // shift (tmp10-tmp12) | |
1155 | |
1156 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1157 psllw_i2r(2, mm5); // prepare for multiply | |
1158 | |
1159 pmulhw_r2r(mm0, mm4); // multiply by converted real | |
1160 | |
1161 /* stage 5 */ | |
1162 | |
1163 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1164 psllw_i2r(2, mm2); // prepare for multiply | |
1165 | |
1166 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply | |
1167 movq_r2r(mm3, mm0); // copy tmp7 | |
1168 | |
1169 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 | |
1170 paddw_r2r(mm1, mm4); // z2 | |
1171 | |
1172 paddw_r2r(mm5, mm0); // z11 | |
1173 psubw_r2r(mm5, mm3); // z13 | |
1174 | |
1175 /* stage 6 */ | |
1176 | |
1177 movq_r2r(mm3, mm5); // copy z13 | |
1178 paddw_r2r(mm1, mm2); // z4 | |
1179 | |
1180 movq_r2r(mm0, mm6); // copy z11 | |
1181 psubw_r2r(mm4, mm5); // y3 | |
1182 | |
1183 paddw_r2r(mm2, mm6); // y1 | |
1184 paddw_r2r(mm4, mm3); // y5 | |
1185 | |
1186 movq_r2m(mm5, *(dataptr+7)); //save y3 | |
1187 | |
1188 movq_r2m(mm6, *(dataptr+3)); //save y1 | |
1189 psubw_r2r(mm2, mm0); // y7 | |
1190 | |
1191 /************************************************************************************************ | |
1192 Start of Transpose | |
1193 ************************************************************************************************/ | |
1194 | |
1195 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2 | |
1196 movq_r2r(mm7, mm5); // copy first line | |
1197 | |
1198 punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines | |
1199 movq_r2r(mm6, mm2); // copy third line | |
1200 | |
1201 punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines | |
1202 movq_r2r(mm7, mm1); // copy first intermediate result | |
1203 | |
1204 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1 | |
1205 | |
1206 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
1207 | |
1208 movq_r2m(mm7, *(dataptr+9)); // write result 1 | |
1209 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines | |
1210 | |
1211 movq_r2m(mm1, *(dataptr+11)); // write result 2 | |
1212 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines | |
1213 | |
1214 movq_r2r(mm5, mm1); // copy first intermediate result | |
1215 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3 | |
1216 | |
1217 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4 | |
1218 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4 | |
1219 | |
1220 movq_r2m(mm5, *(dataptr+13)); // write result 3 | |
1221 | |
1222 /****** last 4x4 done */ | |
1223 | |
1224 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4 | |
1225 | |
1226 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line | |
1227 movq_r2r(mm0, mm6); // copy first line | |
1228 | |
1229 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
1230 movq_r2r(mm2, mm7); // copy third line | |
1231 | |
1232 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines | |
1233 movq_r2r(mm0, mm4); // copy first intermediate result | |
1234 | |
1235 | |
1236 | |
1237 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line | |
1238 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result | |
1239 | |
1240 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line | |
1241 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result | |
1242 | |
1243 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines | |
1244 movq_r2r(mm1, mm2); // copy first line | |
1245 | |
1246 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines | |
1247 movq_r2r(mm6, mm5); // copy first intermediate result | |
1248 | |
1249 movq_r2m(mm0, *(dataptr+8)); // write result 1 | |
1250 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result | |
1251 | |
1252 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines | |
1253 movq_r2r(mm3, mm0); // copy third line | |
1254 | |
1255 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines | |
1256 | |
1257 movq_r2m(mm4, *(dataptr+10)); // write result 2 out | |
1258 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result | |
1259 | |
1260 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines | |
1261 movq_r2r(mm1, mm4); // copy second intermediate result | |
1262 | |
1263 movq_r2m(mm6, *(dataptr+12)); // write result 3 out | |
1264 punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result | |
1265 | |
1266 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines | |
1267 movq_r2r(mm2, mm6); // copy second intermediate result | |
1268 | |
1269 movq_r2m(mm5, *(dataptr+14)); // write result 4 out | |
1270 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result | |
1271 | |
1272 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block) | |
1273 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result | |
1274 | |
1275 movq_r2m(mm4, *(dataptr+3)); // write result 6 out | |
1276 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result | |
1277 | |
1278 movq_r2m(mm2, *(dataptr+5)); // write result 7 out | |
1279 | |
1280 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4 | |
1281 | |
1282 movq_r2m(mm6, *(dataptr+7)); // write result 8 out | |
1283 | |
1284 // Do first 4x4 quadrant, which is used in the beginning of the DCT: | |
1285 | |
1286 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line | |
1287 movq_r2r(mm0, mm2); // copy first line | |
1288 | |
1289 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines | |
1290 movq_r2r(mm7, mm4); // copy third line | |
1291 | |
1292 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines | |
1293 movq_r2r(mm0, mm1); // copy first intermediate result | |
1294 | |
1295 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line | |
1296 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1 | |
1297 | |
1298 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line | |
1299 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2 | |
1300 | |
1301 movq_r2r(mm0, mm7); // write result 1 | |
1302 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines | |
1303 | |
1304 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */ | |
1305 movq_r2r(mm1, mm6); // write result 2 | |
1306 | |
1307 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */ | |
1308 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines | |
1309 | |
1310 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */ | |
1311 movq_r2r(mm2, mm3); // copy first intermediate result | |
1312 | |
1313 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */ | |
1314 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3 | |
1315 | |
1316 movq_r2m(mm7, tmp7); // save tmp07 | |
1317 movq_r2r(mm2, mm5); // write result 3 | |
1318 | |
1319 movq_r2m(mm6, tmp6); // save tmp06 | |
1320 | |
1321 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4 | |
1322 | |
1323 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */ | |
1324 movq_r2r(mm3, mm4); // write result 4 | |
1325 | |
1326 /************************************************************************************************ | |
1327 End of Transpose 2 | |
1328 ************************************************************************************************/ | |
1329 | |
1330 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/ | |
1331 movq_r2r(mm0, mm7); | |
1332 | |
1333 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/ | |
1334 movq_r2r(mm1, mm6); | |
1335 | |
1336 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */ | |
1337 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */ | |
1338 | |
1339 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */ | |
1340 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */ | |
1341 | |
1342 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/ | |
1343 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
1344 | |
1345 /* stage 3 */ | |
1346 | |
1347 movq_m2r(tmp6, mm2); | |
1348 movq_r2r(mm0, mm3); | |
1349 | |
1350 psllw_i2r(2, mm6); // m8 * 2^2 | |
1351 paddw_r2r(mm1, mm0); | |
1352 | |
1353 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
1354 psubw_r2r(mm1, mm3); | |
1355 | |
1356 movq_r2m(mm0, *dataptr); | |
1357 movq_r2r(mm7, mm0); | |
1358 | |
1359 /* Odd part */ | |
1360 movq_r2m(mm3, *(dataptr+8)); | |
1361 paddw_r2r(mm5, mm4); // tmp10 | |
1362 | |
1363 movq_m2r(tmp7, mm3); | |
1364 paddw_r2r(mm6, mm0); // tmp32 | |
1365 | |
1366 paddw_r2r(mm2, mm5); // tmp11 | |
1367 psubw_r2r(mm6, mm7); // tmp33 | |
1368 | |
1369 movq_r2m(mm0, *(dataptr+4)); | |
1370 paddw_r2r(mm3, mm2); // tmp12 | |
1371 | |
1372 /* stage 4 */ | |
1373 movq_r2m(mm7, *(dataptr+12)); | |
1374 movq_r2r(mm4, mm1); // copy of tmp10 | |
1375 | |
1376 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
1377 psllw_i2r(2, mm4); // m8 * 2^2 | |
1378 | |
1379 movq_m2r(RTjpeg_C2mC6, mm0); | |
1380 psllw_i2r(2, mm1); | |
1381 | |
1382 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1383 psllw_i2r(2, mm2); | |
1384 | |
1385 pmulhw_r2r(mm0, mm4); // z5 | |
1386 | |
1387 /* stage 5 */ | |
1388 | |
1389 pmulhw_m2r(RTjpeg_C2pC6, mm2); | |
1390 psllw_i2r(2, mm5); | |
1391 | |
1392 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1393 movq_r2r(mm3, mm0); // copy tmp7 | |
1394 | |
1395 movq_m2r(*(dataptr+1), mm7); | |
1396 paddw_r2r(mm1, mm4); // z2 | |
1397 | |
1398 paddw_r2r(mm1, mm2); // z4 | |
1399 | |
1400 paddw_r2r(mm5, mm0); // z11 | |
1401 psubw_r2r(mm5, mm3); // z13 | |
1402 | |
1403 /* stage 6 */ | |
1404 | |
1405 movq_r2r(mm3, mm5); // copy z13 | |
1406 psubw_r2r(mm4, mm3); // y3=z13 - z2 | |
1407 | |
1408 paddw_r2r(mm4, mm5); // y5=z13 + z2 | |
1409 movq_r2r(mm0, mm6); // copy z11 | |
1410 | |
1411 movq_r2m(mm3, *(dataptr+6)); //save y3 | |
1412 psubw_r2r(mm2, mm0); // y7=z11 - z4 | |
1413 | |
1414 movq_r2m(mm5, *(dataptr+10)); //save y5 | |
1415 paddw_r2r(mm2, mm6); // y1=z11 + z4 | |
1416 | |
1417 movq_r2m(mm0, *(dataptr+14)); //save y7 | |
1418 | |
1419 /************************************************ | |
1420 * End of 1st 4 rows | |
1421 ************************************************/ | |
1422 | |
1423 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */ | |
1424 movq_r2r(mm7, mm0); // copy x0 | |
1425 | |
1426 movq_r2m(mm6, *(dataptr+2)); //save y1 | |
1427 | |
1428 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */ | |
1429 movq_r2r(mm1, mm6); // copy x1 | |
1430 | |
1431 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7 | |
1432 | |
1433 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */ | |
1434 movq_r2r(mm2, mm5); // copy x2 | |
1435 | |
1436 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7 | |
1437 movq_r2r(mm3, mm4); // copy x3 | |
1438 | |
1439 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6 | |
1440 | |
1441 movq_r2m(mm7, tmp7); // save tmp07 | |
1442 movq_r2r(mm0, mm7); // copy tmp00 | |
1443 | |
1444 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6 | |
1445 | |
1446 /* stage 2, Even Part */ | |
1447 | |
1448 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4 | |
1449 | |
1450 movq_r2m(mm6, tmp6); // save tmp06 | |
1451 movq_r2r(mm1, mm6); // copy tmp01 | |
1452 | |
1453 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5 | |
1454 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 | |
1455 | |
1456 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 | |
1457 | |
1458 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4 | |
1459 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 | |
1460 | |
1461 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 | |
1462 | |
1463 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5 | |
1464 paddw_r2r(mm7, mm6); // tmp12 + tmp13 | |
1465 | |
1466 /* stage 3, Even and stage 4 & 5 even */ | |
1467 | |
1468 movq_m2r(tmp6, mm2); // load tmp6 | |
1469 movq_r2r(mm0, mm3); // copy tmp10 | |
1470 | |
1471 psllw_i2r(2, mm6); // shift z1 | |
1472 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11 | |
1473 | |
1474 pmulhw_m2r(RTjpeg_C4, mm6); // z1 | |
1475 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11 | |
1476 | |
1477 movq_r2m(mm0, *(dataptr+1)); //save y0 | |
1478 movq_r2r(mm7, mm0); // copy tmp13 | |
1479 | |
1480 /* odd part */ | |
1481 | |
1482 movq_r2m(mm3, *(dataptr+9)); //save y4 | |
1483 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5 | |
1484 | |
1485 movq_m2r(tmp7, mm3); // load tmp7 | |
1486 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1 | |
1487 | |
1488 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6 | |
1489 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1 | |
1490 | |
1491 movq_r2m(mm0, *(dataptr+5)); //save y2 | |
1492 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7 | |
1493 | |
1494 /* stage 4 */ | |
1495 | |
1496 movq_r2m(mm7, *(dataptr+13)); //save y6 | |
1497 movq_r2r(mm4, mm1); // copy tmp10 | |
1498 | |
1499 psubw_r2r(mm2, mm1); // tmp10 - tmp12 | |
1500 psllw_i2r(2, mm4); // shift tmp10 | |
1501 | |
1502 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6 | |
1503 psllw_i2r(2, mm1); // shift (tmp10-tmp12) | |
1504 | |
1505 pmulhw_m2r(RTjpeg_C6, mm1); // z5 | |
1506 psllw_i2r(2, mm5); // prepare for multiply | |
1507 | |
1508 pmulhw_r2r(mm0, mm4); // multiply by converted real | |
1509 | |
1510 /* stage 5 */ | |
1511 | |
1512 pmulhw_m2r(RTjpeg_C4, mm5); // z3 | |
1513 psllw_i2r(2, mm2); // prepare for multiply | |
1514 | |
1515 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply | |
1516 movq_r2r(mm3, mm0); // copy tmp7 | |
1517 | |
1518 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7 | |
1519 paddw_r2r(mm1, mm4); // z2 | |
1520 | |
1521 paddw_r2r(mm5, mm0); // z11 | |
1522 psubw_r2r(mm5, mm3); // z13 | |
1523 | |
1524 /* stage 6 */ | |
1525 | |
1526 movq_r2r(mm3, mm5); // copy z13 | |
1527 paddw_r2r(mm1, mm2); // z4 | |
1528 | |
1529 movq_r2r(mm0, mm6); // copy z11 | |
1530 psubw_r2r(mm4, mm5); // y3 | |
1531 | |
1532 paddw_r2r(mm2, mm6); // y1 | |
1533 paddw_r2r(mm4, mm3); // y5 | |
1534 | |
1535 movq_r2m(mm5, *(dataptr+7)); //save y3 | |
1536 psubw_r2r(mm2, mm0); // y7=z11 - z4 | |
1537 | |
1538 movq_r2m(mm3, *(dataptr+11)); //save y5 | |
1539 | |
1540 movq_r2m(mm6, *(dataptr+3)); //save y1 | |
1541 | |
1542 movq_r2m(mm0, *(dataptr+15)); //save y7 | |
1543 | |
1544 | |
1545 #endif | |
1546 } | |
1547 | |
1548 #define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */ | |
1549 #define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */ | |
1550 #define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */ | |
1551 #define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */ | |
1552 | |
1553 #define DESCALE(x) (__s16)( ((x)+4) >> 3) | |
1554 | |
1555 /* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */ | |
1556 | |
1557 #define RL(x) ((x)>235) ? 235 : (((x)<16) ? 16 : (x)) | |
1558 #define MULTIPLY(var,const) (((__s32) ((var) * (const)) + 128)>>8) | |
1559 | |
1560 void RTjpeg_idct_init(void) | |
1561 { | |
1562 int i; | |
1563 | |
1564 for(i=0; i<64; i++) | |
1565 { | |
1566 RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32; | |
1567 RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32; | |
1568 } | |
1569 } | |
1570 | |
1571 void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip) | |
1572 { | |
1573 #ifdef MMX | |
1574 | |
1575 static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL; | |
1576 static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; | |
1577 static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL; | |
1578 static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL; | |
1579 static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; | |
1580 | |
1581 mmx_t workspace[64]; | |
1582 mmx_t *wsptr = workspace; | |
1583 register mmx_t *dataptr = (mmx_t *)odata; | |
1584 mmx_t *idata = (mmx_t *)data; | |
1585 | |
1586 rskip = rskip>>3; | |
1587 /* | |
1588 * Perform inverse DCT on one block of coefficients. | |
1589 */ | |
1590 | |
1591 /* Odd part */ | |
1592 | |
1593 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5] | |
1594 | |
1595 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3] | |
1596 | |
1597 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1] | |
1598 | |
1599 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */ | |
1600 | |
1601 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7] | |
1602 | |
1603 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5; | |
1604 | |
1605 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5 | |
1606 | |
1607 psllw_i2r(2, mm2); // shift z10 | |
1608 movq_r2r(mm2, mm0); // copy z10 | |
1609 | |
1610 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ | |
1611 movq_r2r(mm3, mm5); // copy tmp4 | |
1612 | |
1613 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ | |
1614 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7; | |
1615 | |
1616 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */ | |
1617 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7; | |
1618 | |
1619 psubw_r2r(mm1, mm6); // z11-z13 | |
1620 psllw_i2r(2, mm5); // shift z12 | |
1621 | |
1622 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part | |
1623 movq_r2r(mm5, mm7); // copy z12 | |
1624 | |
1625 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part | |
1626 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13; | |
1627 | |
1628 //ok | |
1629 | |
1630 /* Even part */ | |
1631 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ | |
1632 psllw_i2r(2, mm6); | |
1633 | |
1634 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2] | |
1635 | |
1636 paddw_r2r(mm5, mm0); // tmp10 | |
1637 | |
1638 paddw_r2r(mm7, mm2); // tmp12 | |
1639 | |
1640 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ | |
1641 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7 | |
1642 | |
1643 movq_r2r(mm1, mm5); // copy tmp1 | |
1644 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */ | |
1645 | |
1646 psubw_r2r(mm4, mm5); // tmp1-tmp3 | |
1647 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6; | |
1648 | |
1649 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace | |
1650 psllw_i2r(2, mm5); // shift tmp1-tmp3 | |
1651 | |
1652 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0] | |
1653 | |
1654 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562) | |
1655 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5; | |
1656 | |
1657 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4] | |
1658 | |
1659 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ | |
1660 | |
1661 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace | |
1662 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */ | |
1663 | |
1664 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace | |
1665 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2; | |
1666 | |
1667 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2; | |
1668 movq_r2r(mm1, mm5); // copy tmp11 | |
1669 | |
1670 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12; | |
1671 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */ | |
1672 | |
1673 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13; | |
1674 | |
1675 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13; | |
1676 movq_r2r(mm7, mm0); // copy tmp0 | |
1677 | |
1678 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12; | |
1679 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); | |
1680 | |
1681 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); | |
1682 | |
1683 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0] | |
1684 movq_r2r(mm1, mm3); // copy tmp1 | |
1685 | |
1686 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7] | |
1687 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); | |
1688 | |
1689 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); | |
1690 | |
1691 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1] | |
1692 movq_r2r(mm4, mm1); // copy tmp3 | |
1693 | |
1694 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6] | |
1695 | |
1696 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); | |
1697 | |
1698 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); | |
1699 | |
1700 movq_r2m(mm4, *(wsptr+8)); | |
1701 movq_r2r(mm5, mm7); // copy tmp2 | |
1702 | |
1703 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) | |
1704 | |
1705 movq_r2m(mm1, *(wsptr+6)); | |
1706 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); | |
1707 | |
1708 movq_r2m(mm5, *(wsptr+4)); | |
1709 | |
1710 movq_r2m(mm7, *(wsptr+10)); | |
1711 | |
1712 //ok | |
1713 | |
1714 | |
1715 /*****************************************************************/ | |
1716 | |
1717 idata++; | |
1718 wsptr++; | |
1719 | |
1720 /*****************************************************************/ | |
1721 | |
1722 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5] | |
1723 | |
1724 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3] | |
1725 | |
1726 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1] | |
1727 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */ | |
1728 | |
1729 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7] | |
1730 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5; | |
1731 | |
1732 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5 | |
1733 | |
1734 psllw_i2r(2, mm2); // shift z10 | |
1735 movq_r2r(mm2, mm0); // copy z10 | |
1736 | |
1737 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ | |
1738 movq_r2r(mm3, mm5); // copy tmp4 | |
1739 | |
1740 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ | |
1741 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7; | |
1742 | |
1743 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */ | |
1744 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7; | |
1745 | |
1746 psubw_r2r(mm1, mm6); // z11-z13 | |
1747 psllw_i2r(2, mm5); // shift z12 | |
1748 | |
1749 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part | |
1750 movq_r2r(mm5, mm7); // copy z12 | |
1751 | |
1752 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part | |
1753 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13; | |
1754 | |
1755 //ok | |
1756 | |
1757 /* Even part */ | |
1758 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ | |
1759 psllw_i2r(2, mm6); | |
1760 | |
1761 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2] | |
1762 | |
1763 paddw_r2r(mm5, mm0); // tmp10 | |
1764 | |
1765 paddw_r2r(mm7, mm2); // tmp12 | |
1766 | |
1767 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ | |
1768 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7 | |
1769 | |
1770 movq_r2r(mm1, mm5); // copy tmp1 | |
1771 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */ | |
1772 | |
1773 psubw_r2r(mm4, mm5); // tmp1-tmp3 | |
1774 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6; | |
1775 | |
1776 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace | |
1777 psllw_i2r(2, mm5); // shift tmp1-tmp3 | |
1778 | |
1779 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0] | |
1780 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5; | |
1781 | |
1782 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562) | |
1783 | |
1784 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4] | |
1785 | |
1786 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ | |
1787 | |
1788 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace | |
1789 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */ | |
1790 | |
1791 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace | |
1792 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2; | |
1793 | |
1794 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2; | |
1795 movq_r2r(mm1, mm5); // copy tmp11 | |
1796 | |
1797 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12; | |
1798 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */ | |
1799 | |
1800 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13; | |
1801 | |
1802 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13; | |
1803 movq_r2r(mm7, mm0); // copy tmp0 | |
1804 | |
1805 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12; | |
1806 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); | |
1807 | |
1808 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); | |
1809 | |
1810 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0] | |
1811 movq_r2r(mm1, mm3); // copy tmp1 | |
1812 | |
1813 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7] | |
1814 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); | |
1815 | |
1816 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); | |
1817 | |
1818 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1] | |
1819 movq_r2r(mm4, mm1); // copy tmp3 | |
1820 | |
1821 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6] | |
1822 | |
1823 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); | |
1824 | |
1825 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); | |
1826 | |
1827 movq_r2m(mm4, *(wsptr+8)); | |
1828 movq_r2r(mm5, mm7); // copy tmp2 | |
1829 | |
1830 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) | |
1831 | |
1832 movq_r2m(mm1, *(wsptr+6)); | |
1833 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); | |
1834 | |
1835 movq_r2m(mm5, *(wsptr+4)); | |
1836 | |
1837 movq_r2m(mm7, *(wsptr+10)); | |
1838 | |
1839 /*****************************************************************/ | |
1840 | |
1841 /* Pass 2: process rows from work array, store into output array. */ | |
1842 /* Note that we must descale the results by a factor of 8 == 2**3, */ | |
1843 /* and also undo the PASS1_BITS scaling. */ | |
1844 | |
1845 /*****************************************************************/ | |
1846 /* Even part */ | |
1847 | |
1848 wsptr--; | |
1849 | |
1850 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); | |
1851 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); | |
1852 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); | |
1853 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); | |
1854 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3] | |
1855 | |
1856 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7] | |
1857 movq_r2r(mm0, mm2); | |
1858 | |
1859 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3] | |
1860 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] | |
1861 | |
1862 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7] | |
1863 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] | |
1864 | |
1865 movq_r2r(mm0, mm6); | |
1866 movq_r2r(mm3, mm5); | |
1867 | |
1868 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] | |
1869 movq_r2r(mm2, mm1); | |
1870 | |
1871 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] | |
1872 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] | |
1873 | |
1874 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7] | |
1875 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] | |
1876 | |
1877 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3] | |
1878 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] | |
1879 | |
1880 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] | |
1881 movq_r2r(mm3, mm4); | |
1882 | |
1883 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3] | |
1884 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] | |
1885 | |
1886 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7] | |
1887 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] | |
1888 | |
1889 | |
1890 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] | |
1891 movq_r2r(mm6, mm2); | |
1892 | |
1893 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] | |
1894 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] | |
1895 | |
1896 movq_r2r(mm3, mm5); | |
1897 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] | |
1898 | |
1899 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] | |
1900 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] | |
1901 | |
1902 movq_r2r(mm4, mm7); | |
1903 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] | |
1904 | |
1905 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] | |
1906 | |
1907 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] | |
1908 | |
1909 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] | |
1910 movq_r2r(mm1, mm6); | |
1911 | |
1912 //ok | |
1913 | |
1914 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] | |
1915 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] | |
1916 | |
1917 | |
1918 movq_r2r(mm0, mm2); | |
1919 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] | |
1920 | |
1921 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] | |
1922 psllw_i2r(2, mm6); | |
1923 | |
1924 pmulhw_m2r(fix_141, mm6); | |
1925 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] | |
1926 | |
1927 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] | |
1928 movq_r2r(mm0, mm7); | |
1929 | |
1930 // tmp0 = tmp10 + tmp13; | |
1931 // tmp3 = tmp10 - tmp13; | |
1932 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] | |
1933 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] | |
1934 | |
1935 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; | |
1936 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] | |
1937 // tmp1 = tmp11 + tmp12; | |
1938 // tmp2 = tmp11 - tmp12; | |
1939 movq_r2r(mm1, mm5); | |
1940 | |
1941 //OK | |
1942 | |
1943 /* Odd part */ | |
1944 | |
1945 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; | |
1946 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; | |
1947 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; | |
1948 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; | |
1949 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3] | |
1950 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] | |
1951 | |
1952 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7] | |
1953 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] | |
1954 | |
1955 movq_r2r(mm3, mm6); | |
1956 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5] | |
1957 | |
1958 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3] | |
1959 movq_r2r(mm3, mm2); | |
1960 | |
1961 //Save tmp0 and tmp1 in wsptr | |
1962 movq_r2m(mm0, *(wsptr)); // save tmp0 | |
1963 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13] | |
1964 | |
1965 | |
1966 //Continue with z10 --- z13 | |
1967 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3] | |
1968 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10] | |
1969 | |
1970 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7] | |
1971 movq_r2r(mm6, mm4); | |
1972 | |
1973 movq_r2m(mm1, *(wsptr+1)); // save tmp1 | |
1974 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5] | |
1975 | |
1976 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3] | |
1977 movq_r2r(mm6, mm1); | |
1978 | |
1979 //Save tmp2 and tmp3 in wsptr | |
1980 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13] | |
1981 movq_r2r(mm2, mm4); | |
1982 | |
1983 //Continue with z10 --- z13 | |
1984 movq_r2m(mm5, *(wsptr+2)); // save tmp2 | |
1985 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11] | |
1986 | |
1987 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10] | |
1988 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13] | |
1989 | |
1990 movq_r2r(mm3, mm0); | |
1991 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12] | |
1992 | |
1993 movq_r2m(mm7, *(wsptr+3)); // save tmp3 | |
1994 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10] | |
1995 | |
1996 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3] | |
1997 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11] | |
1998 | |
1999 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7] | |
2000 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13] | |
2001 | |
2002 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3] | |
2003 movq_r2r(mm6, mm4); | |
2004 | |
2005 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5] | |
2006 movq_r2r(mm1, mm5); | |
2007 | |
2008 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3] | |
2009 movq_r2r(mm6, mm2); | |
2010 | |
2011 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7] | |
2012 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13] | |
2013 | |
2014 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10] | |
2015 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5] | |
2016 | |
2017 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3] | |
2018 movq_r2r(mm1, mm7); | |
2019 | |
2020 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13] | |
2021 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10] | |
2022 | |
2023 movq_r2r(mm6, mm5); | |
2024 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11] | |
2025 | |
2026 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13] | |
2027 movq_r2r(mm2, mm4); | |
2028 | |
2029 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12] | |
2030 | |
2031 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10] | |
2032 | |
2033 punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11] | |
2034 | |
2035 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13] | |
2036 movq_r2r(mm0, mm5); | |
2037 | |
2038 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10] | |
2039 | |
2040 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11] | |
2041 movq_r2r(mm3, mm4); | |
2042 | |
2043 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13] | |
2044 movq_r2r(mm5, mm1); | |
2045 | |
2046 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12] | |
2047 // tmp7 = z11 + z13; /* phase 5 */ | |
2048 // tmp8 = z11 - z13; /* phase 5 */ | |
2049 psubw_r2r(mm4, mm1); // tmp8 | |
2050 | |
2051 paddw_r2r(mm4, mm5); // tmp7 | |
2052 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ | |
2053 psllw_i2r(2, mm1); | |
2054 | |
2055 psllw_i2r(2, mm0); | |
2056 | |
2057 pmulhw_m2r(fix_141, mm1); // tmp21 | |
2058 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ | |
2059 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ | |
2060 psllw_i2r(2, mm3); | |
2061 movq_r2r(mm0, mm7); | |
2062 | |
2063 pmulhw_m2r(fix_n184, mm7); | |
2064 movq_r2r(mm3, mm6); | |
2065 | |
2066 movq_m2r(*(wsptr), mm2); // tmp0,final1 | |
2067 | |
2068 pmulhw_m2r(fix_108n184, mm6); | |
2069 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ | |
2070 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ | |
2071 movq_r2r(mm2, mm4); // final1 | |
2072 | |
2073 pmulhw_m2r(fix_184n261, mm0); | |
2074 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1 | |
2075 | |
2076 pmulhw_m2r(fix_184, mm3); | |
2077 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1 | |
2078 | |
2079 // tmp6 = tmp22 - tmp7; /* phase 2 */ | |
2080 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1 | |
2081 | |
2082 paddw_r2r(mm6, mm7); // tmp20 | |
2083 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1 | |
2084 | |
2085 paddw_r2r(mm0, mm3); // tmp22 | |
2086 | |
2087 // tmp5 = tmp21 - tmp6; | |
2088 psubw_r2r(mm5, mm3); // tmp6 | |
2089 | |
2090 // tmp4 = tmp20 + tmp5; | |
2091 movq_m2r(*(wsptr+1), mm0); // tmp1,final2 | |
2092 psubw_r2r(mm3, mm1); // tmp5 | |
2093 | |
2094 movq_r2r(mm0, mm6); // final2 | |
2095 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2 | |
2096 | |
2097 /* Final output stage: scale down by a factor of 8 and range-limit */ | |
2098 | |
2099 | |
2100 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) | |
2101 // & RANGE_MASK]; | |
2102 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) | |
2103 // & RANGE_MASK]; final1 | |
2104 | |
2105 | |
2106 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) | |
2107 // & RANGE_MASK]; | |
2108 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) | |
2109 // & RANGE_MASK]; final2 | |
2110 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2 | |
2111 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1] | |
2112 | |
2113 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6] | |
2114 | |
2115 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] | |
2116 | |
2117 movq_m2r(*(wsptr+2), mm5); // tmp2,final3 | |
2118 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] | |
2119 | |
2120 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) | |
2121 // & RANGE_MASK]; | |
2122 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) | |
2123 // & RANGE_MASK]; final3 | |
2124 paddw_r2r(mm1, mm7); // tmp4 | |
2125 movq_r2r(mm5, mm3); | |
2126 | |
2127 paddw_r2r(mm1, mm5); // tmp2+tmp5 | |
2128 psubw_r2r(mm1, mm3); // tmp2-tmp5 | |
2129 | |
2130 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2] | |
2131 | |
2132 movq_m2r(*(wsptr+3), mm4); // tmp3,final4 | |
2133 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5] | |
2134 | |
2135 | |
2136 | |
2137 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) | |
2138 // & RANGE_MASK]; | |
2139 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) | |
2140 // & RANGE_MASK]; final4 | |
2141 movq_r2r(mm4, mm6); | |
2142 paddw_r2r(mm7, mm4); // tmp3+tmp4 | |
2143 | |
2144 psubw_r2r(mm7, mm6); // tmp3-tmp4 | |
2145 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4] | |
2146 | |
2147 // mov ecx, [dataptr] | |
2148 | |
2149 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3] | |
2150 | |
2151 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] | |
2152 | |
2153 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] | |
2154 movq_r2r(mm2, mm4); | |
2155 | |
2156 movq_r2r(mm5, mm7); | |
2157 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] | |
2158 | |
2159 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] | |
2160 movq_r2r(mm2, mm1); | |
2161 | |
2162 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] | |
2163 | |
2164 // add dataptr, 4 | |
2165 | |
2166 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] | |
2167 | |
2168 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] | |
2169 | |
2170 // add ecx, output_col | |
2171 | |
2172 movq_r2r(mm7, mm6); | |
2173 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] | |
2174 | |
2175 movq_r2r(mm2, mm0); | |
2176 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] | |
2177 | |
2178 // mov idata, [dataptr] | |
2179 | |
2180 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] | |
2181 | |
2182 // add dataptr, 4 | |
2183 | |
2184 movq_r2r(mm1, mm3); | |
2185 | |
2186 // add idata, output_col | |
2187 | |
2188 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] | |
2189 | |
2190 movq_r2m(mm2, *(dataptr)); | |
2191 | |
2192 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] | |
2193 | |
2194 dataptr += rskip; | |
2195 movq_r2m(mm0, *(dataptr)); | |
2196 | |
2197 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] | |
2198 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] | |
2199 | |
2200 dataptr += rskip; | |
2201 movq_r2m(mm1, *(dataptr)); | |
2202 | |
2203 dataptr += rskip; | |
2204 movq_r2m(mm3, *(dataptr)); | |
2205 | |
2206 /*******************************************************************/ | |
2207 | |
2208 wsptr += 8; | |
2209 | |
2210 /*******************************************************************/ | |
2211 | |
2212 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); | |
2213 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); | |
2214 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); | |
2215 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); | |
2216 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3] | |
2217 | |
2218 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7] | |
2219 movq_r2r(mm0, mm2); | |
2220 | |
2221 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3] | |
2222 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] | |
2223 | |
2224 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7] | |
2225 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] | |
2226 | |
2227 movq_r2r(mm0, mm6); | |
2228 movq_r2r(mm3, mm5); | |
2229 | |
2230 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] | |
2231 movq_r2r(mm2, mm1); | |
2232 | |
2233 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] | |
2234 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] | |
2235 | |
2236 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7] | |
2237 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] | |
2238 | |
2239 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3] | |
2240 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] | |
2241 | |
2242 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] | |
2243 movq_r2r(mm3, mm4); | |
2244 | |
2245 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3] | |
2246 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] | |
2247 | |
2248 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7] | |
2249 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] | |
2250 | |
2251 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] | |
2252 movq_r2r(mm6, mm2); | |
2253 | |
2254 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] | |
2255 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] | |
2256 | |
2257 movq_r2r(mm3, mm5); | |
2258 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] | |
2259 | |
2260 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] | |
2261 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] | |
2262 | |
2263 movq_r2r(mm4, mm7); | |
2264 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] | |
2265 | |
2266 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] | |
2267 | |
2268 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] | |
2269 | |
2270 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] | |
2271 movq_r2r(mm1, mm6); | |
2272 | |
2273 //OK | |
2274 | |
2275 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] | |
2276 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] | |
2277 | |
2278 movq_r2r(mm0, mm2); | |
2279 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] | |
2280 | |
2281 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] | |
2282 psllw_i2r(2, mm6); | |
2283 | |
2284 pmulhw_m2r(fix_141, mm6); | |
2285 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] | |
2286 | |
2287 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] | |
2288 movq_r2r(mm0, mm7); | |
2289 | |
2290 // tmp0 = tmp10 + tmp13; | |
2291 // tmp3 = tmp10 - tmp13; | |
2292 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] | |
2293 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] | |
2294 | |
2295 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; | |
2296 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] | |
2297 // tmp1 = tmp11 + tmp12; | |
2298 // tmp2 = tmp11 - tmp12; | |
2299 movq_r2r(mm1, mm5); | |
2300 | |
2301 //OK | |
2302 | |
2303 | |
2304 /* Odd part */ | |
2305 | |
2306 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; | |
2307 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; | |
2308 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; | |
2309 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; | |
2310 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3] | |
2311 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] | |
2312 | |
2313 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7] | |
2314 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] | |
2315 | |
2316 movq_r2r(mm3, mm6); | |
2317 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5] | |
2318 | |
2319 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3] | |
2320 movq_r2r(mm3, mm2); | |
2321 | |
2322 //Save tmp0 and tmp1 in wsptr | |
2323 movq_r2m(mm0, *(wsptr)); // save tmp0 | |
2324 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13] | |
2325 | |
2326 | |
2327 //Continue with z10 --- z13 | |
2328 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3] | |
2329 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10] | |
2330 | |
2331 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7] | |
2332 movq_r2r(mm6, mm4); | |
2333 | |
2334 movq_r2m(mm1, *(wsptr+1)); // save tmp1 | |
2335 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5] | |
2336 | |
2337 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3] | |
2338 movq_r2r(mm6, mm1); | |
2339 | |
2340 //Save tmp2 and tmp3 in wsptr | |
2341 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13] | |
2342 movq_r2r(mm2, mm4); | |
2343 | |
2344 //Continue with z10 --- z13 | |
2345 movq_r2m(mm5, *(wsptr+2)); // save tmp2 | |
2346 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11] | |
2347 | |
2348 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10] | |
2349 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13] | |
2350 | |
2351 movq_r2r(mm3, mm0); | |
2352 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12] | |
2353 | |
2354 movq_r2m(mm7, *(wsptr+3)); // save tmp3 | |
2355 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10] | |
2356 | |
2357 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3] | |
2358 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11] | |
2359 | |
2360 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7] | |
2361 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13] | |
2362 | |
2363 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3] | |
2364 movq_r2r(mm6, mm4); | |
2365 | |
2366 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5] | |
2367 movq_r2r(mm1, mm5); | |
2368 | |
2369 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3] | |
2370 movq_r2r(mm6, mm2); | |
2371 | |
2372 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7] | |
2373 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13] | |
2374 | |
2375 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10] | |
2376 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5] | |
2377 | |
2378 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3] | |
2379 movq_r2r(mm1, mm7); | |
2380 | |
2381 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13] | |
2382 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10] | |
2383 | |
2384 movq_r2r(mm6, mm5); | |
2385 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11] | |
2386 | |
2387 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13] | |
2388 movq_r2r(mm2, mm4); | |
2389 | |
2390 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12] | |
2391 | |
2392 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10] | |
2393 | |
2394 punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11] | |
2395 | |
2396 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13] | |
2397 movq_r2r(mm0, mm5); | |
2398 | |
2399 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10] | |
2400 | |
2401 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11] | |
2402 movq_r2r(mm3, mm4); | |
2403 | |
2404 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13] | |
2405 movq_r2r(mm5, mm1); | |
2406 | |
2407 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12] | |
2408 // tmp7 = z11 + z13; /* phase 5 */ | |
2409 // tmp8 = z11 - z13; /* phase 5 */ | |
2410 psubw_r2r(mm4, mm1); // tmp8 | |
2411 | |
2412 paddw_r2r(mm4, mm5); // tmp7 | |
2413 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ | |
2414 psllw_i2r(2, mm1); | |
2415 | |
2416 psllw_i2r(2, mm0); | |
2417 | |
2418 pmulhw_m2r(fix_141, mm1); // tmp21 | |
2419 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ | |
2420 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ | |
2421 psllw_i2r(2, mm3); | |
2422 movq_r2r(mm0, mm7); | |
2423 | |
2424 pmulhw_m2r(fix_n184, mm7); | |
2425 movq_r2r(mm3, mm6); | |
2426 | |
2427 movq_m2r(*(wsptr), mm2); // tmp0,final1 | |
2428 | |
2429 pmulhw_m2r(fix_108n184, mm6); | |
2430 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ | |
2431 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ | |
2432 movq_r2r(mm2, mm4); // final1 | |
2433 | |
2434 pmulhw_m2r(fix_184n261, mm0); | |
2435 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1 | |
2436 | |
2437 pmulhw_m2r(fix_184, mm3); | |
2438 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1 | |
2439 | |
2440 // tmp6 = tmp22 - tmp7; /* phase 2 */ | |
2441 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1 | |
2442 | |
2443 paddw_r2r(mm6, mm7); // tmp20 | |
2444 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1 | |
2445 | |
2446 paddw_r2r(mm0, mm3); // tmp22 | |
2447 | |
2448 // tmp5 = tmp21 - tmp6; | |
2449 psubw_r2r(mm5, mm3); // tmp6 | |
2450 | |
2451 // tmp4 = tmp20 + tmp5; | |
2452 movq_m2r(*(wsptr+1), mm0); // tmp1,final2 | |
2453 psubw_r2r(mm3, mm1); // tmp5 | |
2454 | |
2455 movq_r2r(mm0, mm6); // final2 | |
2456 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2 | |
2457 | |
2458 /* Final output stage: scale down by a factor of 8 and range-limit */ | |
2459 | |
2460 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) | |
2461 // & RANGE_MASK]; | |
2462 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) | |
2463 // & RANGE_MASK]; final1 | |
2464 | |
2465 | |
2466 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) | |
2467 // & RANGE_MASK]; | |
2468 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) | |
2469 // & RANGE_MASK]; final2 | |
2470 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2 | |
2471 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1] | |
2472 | |
2473 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6] | |
2474 | |
2475 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] | |
2476 | |
2477 movq_m2r(*(wsptr+2), mm5); // tmp2,final3 | |
2478 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] | |
2479 | |
2480 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) | |
2481 // & RANGE_MASK]; | |
2482 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) | |
2483 // & RANGE_MASK]; final3 | |
2484 paddw_r2r(mm1, mm7); // tmp4 | |
2485 movq_r2r(mm5, mm3); | |
2486 | |
2487 paddw_r2r(mm1, mm5); // tmp2+tmp5 | |
2488 psubw_r2r(mm1, mm3); // tmp2-tmp5 | |
2489 | |
2490 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2] | |
2491 | |
2492 movq_m2r(*(wsptr+3), mm4); // tmp3,final4 | |
2493 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5] | |
2494 | |
2495 | |
2496 | |
2497 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) | |
2498 // & RANGE_MASK]; | |
2499 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) | |
2500 // & RANGE_MASK]; final4 | |
2501 movq_r2r(mm4, mm6); | |
2502 paddw_r2r(mm7, mm4); // tmp3+tmp4 | |
2503 | |
2504 psubw_r2r(mm7, mm6); // tmp3-tmp4 | |
2505 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4] | |
2506 | |
2507 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3] | |
2508 | |
2509 /* | |
2510 movq_r2m(mm4, *dummy); | |
2511 fprintf(stderr, "3-4 %016llx\n", dummy); | |
2512 movq_r2m(mm4, *dummy); | |
2513 fprintf(stderr, "3+4 %016llx\n", dummy); | |
2514 */ | |
2515 | |
2516 | |
2517 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] | |
2518 | |
2519 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] | |
2520 movq_r2r(mm2, mm4); | |
2521 | |
2522 movq_r2r(mm5, mm7); | |
2523 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] | |
2524 | |
2525 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] | |
2526 movq_r2r(mm2, mm1); | |
2527 | |
2528 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] | |
2529 | |
2530 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] | |
2531 | |
2532 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] | |
2533 | |
2534 movq_r2r(mm7, mm6); | |
2535 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] | |
2536 | |
2537 movq_r2r(mm2, mm0); | |
2538 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] | |
2539 | |
2540 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] | |
2541 | |
2542 movq_r2r(mm1, mm3); | |
2543 | |
2544 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] | |
2545 | |
2546 dataptr += rskip; | |
2547 movq_r2m(mm2, *(dataptr)); | |
2548 | |
2549 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] | |
2550 | |
2551 dataptr += rskip; | |
2552 movq_r2m(mm0, *(dataptr)); | |
2553 | |
2554 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] | |
2555 | |
2556 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] | |
2557 | |
2558 dataptr += rskip; | |
2559 movq_r2m(mm1, *(dataptr)); | |
2560 | |
2561 dataptr += rskip; | |
2562 movq_r2m(mm3, *(dataptr)); | |
2563 | |
2564 #else | |
2565 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
2566 __s32 tmp10, tmp11, tmp12, tmp13; | |
2567 __s32 z5, z10, z11, z12, z13; | |
2568 __s16 *inptr; | |
2569 __s32 *wsptr; | |
2570 __u8 *outptr; | |
2571 int ctr; | |
2572 __s32 dcval; | |
2573 __s32 workspace[64]; | |
2574 | |
2575 inptr = data; | |
2576 wsptr = workspace; | |
2577 for (ctr = 8; ctr > 0; ctr--) { | |
2578 | |
2579 if ((inptr[8] | inptr[16] | inptr[24] | | |
2580 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) { | |
2581 dcval = inptr[0]; | |
2582 wsptr[0] = dcval; | |
2583 wsptr[8] = dcval; | |
2584 wsptr[16] = dcval; | |
2585 wsptr[24] = dcval; | |
2586 wsptr[32] = dcval; | |
2587 wsptr[40] = dcval; | |
2588 wsptr[48] = dcval; | |
2589 wsptr[56] = dcval; | |
2590 | |
2591 inptr++; | |
2592 wsptr++; | |
2593 continue; | |
2594 } | |
2595 | |
2596 tmp0 = inptr[0]; | |
2597 tmp1 = inptr[16]; | |
2598 tmp2 = inptr[32]; | |
2599 tmp3 = inptr[48]; | |
2600 | |
2601 tmp10 = tmp0 + tmp2; | |
2602 tmp11 = tmp0 - tmp2; | |
2603 | |
2604 tmp13 = tmp1 + tmp3; | |
2605 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; | |
2606 | |
2607 tmp0 = tmp10 + tmp13; | |
2608 tmp3 = tmp10 - tmp13; | |
2609 tmp1 = tmp11 + tmp12; | |
2610 tmp2 = tmp11 - tmp12; | |
2611 | |
2612 tmp4 = inptr[8]; | |
2613 tmp5 = inptr[24]; | |
2614 tmp6 = inptr[40]; | |
2615 tmp7 = inptr[56]; | |
2616 | |
2617 z13 = tmp6 + tmp5; | |
2618 z10 = tmp6 - tmp5; | |
2619 z11 = tmp4 + tmp7; | |
2620 z12 = tmp4 - tmp7; | |
2621 | |
2622 tmp7 = z11 + z13; | |
2623 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); | |
2624 | |
2625 z5 = MULTIPLY(z10 + z12, FIX_1_847759065); | |
2626 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; | |
2627 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; | |
2628 | |
2629 tmp6 = tmp12 - tmp7; | |
2630 tmp5 = tmp11 - tmp6; | |
2631 tmp4 = tmp10 + tmp5; | |
2632 | |
2633 wsptr[0] = (__s32) (tmp0 + tmp7); | |
2634 wsptr[56] = (__s32) (tmp0 - tmp7); | |
2635 wsptr[8] = (__s32) (tmp1 + tmp6); | |
2636 wsptr[48] = (__s32) (tmp1 - tmp6); | |
2637 wsptr[16] = (__s32) (tmp2 + tmp5); | |
2638 wsptr[40] = (__s32) (tmp2 - tmp5); | |
2639 wsptr[32] = (__s32) (tmp3 + tmp4); | |
2640 wsptr[24] = (__s32) (tmp3 - tmp4); | |
2641 | |
2642 inptr++; | |
2643 wsptr++; | |
2644 } | |
2645 | |
2646 wsptr = workspace; | |
2647 for (ctr = 0; ctr < 8; ctr++) { | |
2648 outptr = &(odata[ctr*rskip]); | |
2649 | |
2650 tmp10 = wsptr[0] + wsptr[4]; | |
2651 tmp11 = wsptr[0] - wsptr[4]; | |
2652 | |
2653 tmp13 = wsptr[2] + wsptr[6]; | |
2654 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13; | |
2655 | |
2656 tmp0 = tmp10 + tmp13; | |
2657 tmp3 = tmp10 - tmp13; | |
2658 tmp1 = tmp11 + tmp12; | |
2659 tmp2 = tmp11 - tmp12; | |
2660 | |
2661 z13 = wsptr[5] + wsptr[3]; | |
2662 z10 = wsptr[5] - wsptr[3]; | |
2663 z11 = wsptr[1] + wsptr[7]; | |
2664 z12 = wsptr[1] - wsptr[7]; | |
2665 | |
2666 tmp7 = z11 + z13; | |
2667 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); | |
2668 | |
2669 z5 = MULTIPLY(z10 + z12, FIX_1_847759065); | |
2670 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; | |
2671 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; | |
2672 | |
2673 tmp6 = tmp12 - tmp7; | |
2674 tmp5 = tmp11 - tmp6; | |
2675 tmp4 = tmp10 + tmp5; | |
2676 | |
2677 outptr[0] = RL(DESCALE(tmp0 + tmp7)); | |
2678 outptr[7] = RL(DESCALE(tmp0 - tmp7)); | |
2679 outptr[1] = RL(DESCALE(tmp1 + tmp6)); | |
2680 outptr[6] = RL(DESCALE(tmp1 - tmp6)); | |
2681 outptr[2] = RL(DESCALE(tmp2 + tmp5)); | |
2682 outptr[5] = RL(DESCALE(tmp2 - tmp5)); | |
2683 outptr[4] = RL(DESCALE(tmp3 + tmp4)); | |
2684 outptr[3] = RL(DESCALE(tmp3 - tmp4)); | |
2685 | |
2686 wsptr += 8; | |
2687 } | |
2688 #endif | |
2689 } | |
2690 /* | |
2691 | |
2692 Main Routines | |
2693 | |
2694 This file contains most of the initialisation and control functions | |
2695 | |
2696 (C) Justin Schoeman 1998 | |
2697 | |
2698 */ | |
2699 | |
2700 /* | |
2701 | |
2702 Private function | |
2703 | |
2704 Initialise all the cache-aligned data blocks | |
2705 | |
2706 */ | |
2707 | |
2708 void RTjpeg_init_data(void) | |
2709 { | |
2710 unsigned long dptr; | |
2711 | |
2712 dptr=(unsigned long)&(RTjpeg_alldata[0]); | |
2713 dptr+=32; | |
2714 dptr=dptr>>5; | |
2715 dptr=dptr<<5; /* cache align data */ | |
2716 | |
2717 RTjpeg_block=(__s16 *)dptr; | |
2718 dptr+=sizeof(__s16)*64; | |
2719 RTjpeg_lqt=(__s32 *)dptr; | |
2720 dptr+=sizeof(__s32)*64; | |
2721 RTjpeg_cqt=(__s32 *)dptr; | |
2722 dptr+=sizeof(__s32)*64; | |
2723 RTjpeg_liqt=(__u32 *)dptr; | |
2724 dptr+=sizeof(__u32)*64; | |
2725 RTjpeg_ciqt=(__u32 *)dptr; | |
2726 } | |
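/*
   Illustrative sketch, not part of the original source: the pointer
   arithmetic above rounds the address of RTjpeg_alldata up to a 32 byte
   (cache line) boundary before carving the tables out of it.  The helper
   below shows the same rounding in isolation; the name align32 is an
   assumption used only for this example.
*/
#if 0
static void *align32(void *p)
{
  unsigned long a = (unsigned long)p;
  a += 32;            /* stay inside the 32 bytes of slack that were allocated */
  a = (a >> 5) << 5;  /* clear the low 5 bits: a multiple of 32 inside the padded buffer */
  return (void *)a;
}
#endif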
2727 | |
2728 /* | |
2729 | |
2730 External Function | |
2731 | |
2732 Re-set quality factor | |
2733 | |
2734 Input: Q -> quality factor (192=best, 32=worst) | |
2735 (unlike init_compress, this call takes no table buffer; the | |
2736 quantisation tables are simply recomputed in place) | |
2737 */ | |
2738 | |
2739 void RTjpeg_init_Q(__u8 Q) | |
2740 { | |
2741 int i; | |
2742 __u64 qual; | |
2743 | |
2744 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ | |
2745 | |
2746 for(i=0; i<64; i++) | |
2747 { | |
2748 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); | |
2749 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; | |
2750 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); | |
2751 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; | |
2752 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); | |
2753 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); | |
2754 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; | |
2755 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; | |
2756 } | |
2757 | |
2758 RTjpeg_lb8=0; | |
2759 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); | |
2760 RTjpeg_lb8--; | |
2761 RTjpeg_cb8=0; | |
2762 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); | |
2763 RTjpeg_cb8--; | |
2764 | |
2765 RTjpeg_dct_init(); | |
2766 RTjpeg_idct_init(); | |
2767 RTjpeg_quant_init(); | |
2768 } | |
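/*
   Illustrative note (an observation, not from the original comments):
   Q is mapped to a 32.32 fixed point scale factor by qual = Q << 25,
   so Q = 255 corresponds to roughly 2.0 and Q = 0 to 0.0, matching the
   "255=2, 0=0" remark above.  A minimal sketch of changing the quality
   between frames; the wrapper name is an assumption for the example:
*/
#if 0
static void example_change_quality(__u8 new_Q)
{
  /* recomputes RTjpeg_lqt/RTjpeg_cqt and their inverse tables in place */
  RTjpeg_init_Q(new_Q);
}
#endif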
2769 | |
2770 /* | |
2771 | |
2772 External Function | |
2773 | |
2774 Initialise compression. | |
2775 | |
2776 Input: buf -> pointer to 128 ints in which the quantisation tables are | |
2777 stored, to be passed on to init_decompress. | |
2778 width -> width of image | |
2779 height -> height of image | |
2780 Q -> quality factor (192=best, 32=worst) | |
2781 | |
2782 */ | |
2783 | |
2784 void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q) | |
2785 { | |
2786 int i; | |
2787 __u64 qual; | |
2788 | |
2789 RTjpeg_init_data(); | |
2790 | |
2791 RTjpeg_width=width; | |
2792 RTjpeg_height=height; | |
2793 RTjpeg_Ywidth = RTjpeg_width>>3; | |
2794 RTjpeg_Ysize=width * height; | |
2795 RTjpeg_Cwidth = RTjpeg_width>>4; | |
2796 RTjpeg_Csize= (width>>1) * height; | |
2797 | |
2798 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ | |
2799 | |
2800 for(i=0; i<64; i++) | |
2801 { | |
2802 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); | |
2803 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; | |
2804 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); | |
2805 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; | |
2806 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); | |
2807 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); | |
2808 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; | |
2809 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; | |
2810 } | |
2811 | |
2812 RTjpeg_lb8=0; | |
2813 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); | |
2814 RTjpeg_lb8--; | |
2815 RTjpeg_cb8=0; | |
2816 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); | |
2817 RTjpeg_cb8--; | |
2818 | |
2819 RTjpeg_dct_init(); | |
2820 RTjpeg_quant_init(); | |
2821 | |
2822 for(i=0; i<64; i++) | |
2823 buf[i]=RTjpeg_liqt[i]; | |
2824 for(i=0; i<64; i++) | |
2825 buf[64+i]=RTjpeg_ciqt[i]; | |
2826 } | |
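/*
   Usage sketch (illustrative, not from the original source): a caller
   allocates a 128-entry table buffer, initialises compression once per
   stream, and then feeds planar YUV 4:2:0 frames to
   RTjpeg_compressYUV420().  The buffer names and doing both steps in one
   function are assumptions made for brevity; the output buffer only has
   to be large enough for a worst-case compressed frame.
*/
#if 0
static int example_compress_frame(unsigned char *yuv420_frame, __s8 *out,
                                  int width, int height)
{
  __u32 tables[128];  /* quant tables, to be handed to the decompressor */

  RTjpeg_init_compress(tables, width, height, 192); /* Q=192: high quality */
  return RTjpeg_compressYUV420(out, yuv420_frame);  /* returns bytes written */
}
#endif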
2827 | |
2828 void RTjpeg_init_decompress(__u32 *buf, int width, int height) | |
2829 { | |
2830 int i; | |
2831 | |
2832 RTjpeg_init_data(); | |
2833 | |
2834 RTjpeg_width=width; | |
2835 RTjpeg_height=height; | |
2836 RTjpeg_Ywidth = RTjpeg_width>>3; | |
2837 RTjpeg_Ysize=width * height; | |
2838 RTjpeg_Cwidth = RTjpeg_width>>4; | |
2839 RTjpeg_Csize= (width>>1) * height; | |
2840 | |
2841 for(i=0; i<64; i++) | |
2842 { | |
2843 RTjpeg_liqt[i]=buf[i]; | |
2844 RTjpeg_ciqt[i]=buf[i+64]; | |
2845 } | |
2846 | |
2847 RTjpeg_lb8=0; | |
2848 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); | |
2849 RTjpeg_lb8--; | |
2850 RTjpeg_cb8=0; | |
2851 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); | |
2852 RTjpeg_cb8--; | |
2853 | |
2854 RTjpeg_idct_init(); | |
2855 | |
2856 // RTjpeg_color_init(); | |
2857 } | |
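/*
   Decompression side of the same sketch (illustrative): the 128-word table
   buffer filled in by RTjpeg_init_compress() has to be passed to
   RTjpeg_init_decompress() before any frame is decoded.  The names are
   assumptions for the example only.
*/
#if 0
static void example_decompress_frame(__u32 *tables, __s8 *in, __u8 *yuv420_out,
                                     int width, int height)
{
  RTjpeg_init_decompress(tables, width, height); /* once per stream */
  RTjpeg_decompressYUV420(in, yuv420_out);       /* one compressed frame */
}
#endif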
2858 | |
2859 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp) | |
2860 { | |
2861 __s8 * sb; | |
2862 register __s8 * bp1 = bp + (RTjpeg_width<<3); | |
2863 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
2864 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); | |
2865 register int i, j, k; | |
2866 | |
2867 #ifdef MMX | |
2868 emms(); | |
2869 #endif | |
2870 sb=sp; | |
2871 /* Y */ | |
2872 for(i=RTjpeg_height>>1; i; i-=8) | |
2873 { | |
2874 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) | |
2875 { | |
2876 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); | |
2877 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2878 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2879 | |
2880 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
2881 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2882 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2883 | |
2884 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); | |
2885 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2886 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2887 | |
2888 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
2889 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2890 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2891 | |
2892 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); | |
2893 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
2894 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
2895 | |
2896 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); | |
2897 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
2898 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
2899 | |
2900 } | |
2901 bp+=RTjpeg_width<<4; | |
2902 bp1+=RTjpeg_width<<4; | |
2903 bp2+=RTjpeg_width<<2; | |
2904 bp3+=RTjpeg_width<<2; | |
2905 | |
2906 } | |
2907 #ifdef MMX | |
2908 emms(); | |
2909 #endif | |
2910 return (sp-sb); | |
2911 } | |
2912 | |
2913 int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp) | |
2914 { | |
2915 __s8 * sb; | |
2916 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
2917 register __s8 * bp3 = bp2 + RTjpeg_Csize; | |
2918 register int i, j, k; | |
2919 | |
2920 #ifdef MMX | |
2921 emms(); | |
2922 #endif | |
2923 sb=sp; | |
2924 /* Y */ | |
2925 for(i=RTjpeg_height; i; i-=8) | |
2926 { | |
2927 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) | |
2928 { | |
2929 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); | |
2930 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2931 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2932 | |
2933 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
2934 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2935 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2936 | |
2937 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); | |
2938 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
2939 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
2940 | |
2941 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); | |
2942 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
2943 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
2944 | |
2945 } | |
2946 bp+=RTjpeg_width<<3; | |
2947 bp2+=RTjpeg_width<<2; | |
2948 bp3+=RTjpeg_width<<2; | |
2949 | |
2950 } | |
2951 #ifdef MMX | |
2952 emms(); | |
2953 #endif | |
2954 return (sp-sb); | |
2955 } | |
2956 | |
2957 int RTjpeg_compress8(__s8 *sp, unsigned char *bp) | |
2958 { | |
2959 __s8 * sb; | |
2960 int i, j; | |
2961 | |
2962 #ifdef MMX | |
2963 emms(); | |
2964 #endif | |
2965 | |
2966 sb=sp; | |
2967 /* Y */ | |
2968 for(i=0; i<RTjpeg_height; i+=8) | |
2969 { | |
2970 for(j=0; j<RTjpeg_width; j+=8) | |
2971 { | |
2972 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width); | |
2973 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
2974 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
2975 } | |
2976 bp+=RTjpeg_width; | |
2977 } | |
2978 | |
2979 #ifdef MMX | |
2980 emms(); | |
2981 #endif | |
2982 return (sp-sb); | |
2983 } | |
2984 | |
2985 void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp) | |
2986 { | |
2987 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
2988 register __s8 * bp3 = bp2 + (RTjpeg_Csize); | |
2989 int i, j,k; | |
2990 | |
2991 #ifdef MMX | |
2992 emms(); | |
2993 #endif | |
2994 | |
2995 /* Y */ | |
2996 for(i=RTjpeg_height; i; i-=8) | |
2997 { | |
2998 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) { | |
2999 if(*sp==-1)sp++; | |
3000 else | |
3001 { | |
3002 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3003 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width); | |
3004 } | |
3005 if(*sp==-1)sp++; | |
3006 else | |
3007 { | |
3008 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3009 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width); | |
3010 } | |
3011 if(*sp==-1)sp++; | |
3012 else | |
3013 { | |
3014 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); | |
3015 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1); | |
3016 } | |
3017 if(*sp==-1)sp++; | |
3018 else | |
3019 { | |
3020 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); | |
3021 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); | |
3022 } | |
3023 } | |
3024 bp+=RTjpeg_width<<3; | |
3025 bp2+=RTjpeg_width<<2; | |
3026 bp3+=RTjpeg_width<<2; | |
3027 } | |
3028 #ifdef MMX | |
3029 emms(); | |
3030 #endif | |
3031 } | |
3032 | |
3033 void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp) | |
3034 { | |
3035 register __s8 * bp1 = bp + (RTjpeg_width<<3); | |
3036 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
3037 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); | |
3038 int i, j,k; | |
3039 | |
3040 #ifdef MMX | |
3041 emms(); | |
3042 #endif | |
3043 | |
3044 /* Y */ | |
3045 for(i=RTjpeg_height>>1; i; i-=8) | |
3046 { | |
3047 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) { | |
3048 if(*sp==-1)sp++; | |
3049 else | |
3050 { | |
3051 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3052 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width); | |
3053 } | |
3054 if(*sp==-1)sp++; | |
3055 else | |
3056 { | |
3057 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3058 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width); | |
3059 } | |
3060 if(*sp==-1)sp++; | |
3061 else | |
3062 { | |
3063 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3064 RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width); | |
3065 } | |
3066 if(*sp==-1)sp++; | |
3067 else | |
3068 { | |
3069 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3070 RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width); | |
3071 } | |
3072 if(*sp==-1)sp++; | |
3073 else | |
3074 { | |
3075 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); | |
3076 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1); | |
3077 } | |
3078 if(*sp==-1)sp++; | |
3079 else | |
3080 { | |
3081 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); | |
3082 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); | |
3083 } | |
3084 } | |
3085 bp+=RTjpeg_width<<4; | |
3086 bp1+=RTjpeg_width<<4; | |
3087 bp2+=RTjpeg_width<<2; | |
3088 bp3+=RTjpeg_width<<2; | |
3089 } | |
3090 #ifdef MMX | |
3091 emms(); | |
3092 #endif | |
3093 } | |
3094 | |
3095 void RTjpeg_decompress8(__s8 *sp, __u8 *bp) | |
3096 { | |
3097 int i, j; | |
3098 | |
3099 #ifdef MMX | |
3100 emms(); | |
3101 #endif | |
3102 | |
3103 /* Y */ | |
3104 for(i=0; i<RTjpeg_height; i+=8) | |
3105 { | |
3106 for(j=0; j<RTjpeg_width; j+=8) | |
3107 if(*sp==-1)sp++; | |
3108 else | |
3109 { | |
3110 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); | |
3111 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width); | |
3112 } | |
3113 bp+=RTjpeg_width<<3; | |
3114 } | |
3115 } | |
3116 | |
3117 /* | |
3118 External Function | |
3119 | |
3120 Initialise additional data structures for motion compensation | |
3121 | |
3122 */ | |
3123 | |
3124 void RTjpeg_init_mcompress(void) | |
3125 { | |
3126 unsigned long tmp; | |
3127 | |
3128 if(!RTjpeg_old) | |
3129 { | |
3130 RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32); | |
3131 if(!RTjpeg_old) /* check the raw malloc result before it is rounded */ | |
3132 { | |
3133 fprintf(stderr, "RTjpeg: Could not allocate memory\n"); | |
3134 exit(-1); | |
3135 } | |
3136 tmp=(unsigned long)RTjpeg_old; | |
3137 tmp+=32; | |
3138 tmp=tmp>>5; | |
3139 RTjpeg_old=(__s16 *)(tmp<<5); /* 32 byte align the block store */ | |
3140 } | |
3141 bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height))); | |
3142 } | |
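/*
   Usage sketch for the motion-compensated ("delta") path (illustrative,
   not from the original source): RTjpeg_init_mcompress() allocates the
   reference block store, and RTjpeg_mcompressYUV420() then writes a single
   255 marker byte instead of coefficient data for every 8x8 block whose
   quantised coefficients stay within the given luma/chroma thresholds.
   The threshold values below are arbitrary example assumptions.
*/
#if 0
static int example_delta_compress(unsigned char *yuv420_frame, __s8 *out)
{
  RTjpeg_init_mcompress();  /* once, after RTjpeg_init_compress() */
  return RTjpeg_mcompressYUV420(out, yuv420_frame, 8, 8); /* lmask, cmask */
}
#endif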
3143 | |
3144 #ifdef MMX | |
3145 | |
3146 int RTjpeg_bcomp(__s16 *old, mmx_t *mask) | |
3147 { | |
3148 int i; | |
3149 mmx_t *mold=(mmx_t *)old; | |
3150 mmx_t *mblock=(mmx_t *)RTjpeg_block; | |
3151 volatile mmx_t result; | |
3152 static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL; | |
3153 | |
3154 movq_m2r(*mask, mm7); | |
3155 movq_m2r(neg, mm6); | |
3156 pxor_r2r(mm5, mm5); | |
3157 | |
3158 for(i=0; i<8; i++) | |
3159 { | |
3160 movq_m2r(*(mblock++), mm0); | |
3161 movq_m2r(*(mblock++), mm2); | |
3162 movq_m2r(*(mold++), mm1); | |
3163 movq_m2r(*(mold++), mm3); | |
3164 psubsw_r2r(mm1, mm0); | |
3165 psubsw_r2r(mm3, mm2); | |
3166 movq_r2r(mm0, mm1); | |
3167 movq_r2r(mm2, mm3); | |
3168 pcmpgtw_r2r(mm7, mm0); | |
3169 pcmpgtw_r2r(mm7, mm2); | |
3170 pxor_r2r(mm6, mm1); | |
3171 pxor_r2r(mm6, mm3); | |
3172 pcmpgtw_r2r(mm7, mm1); | |
3173 pcmpgtw_r2r(mm7, mm3); | |
3174 por_r2r(mm0, mm5); | |
3175 por_r2r(mm2, mm5); | |
3176 por_r2r(mm1, mm5); | |
3177 por_r2r(mm3, mm5); | |
3178 } | |
3179 movq_r2m(mm5, result); | |
3180 | |
3181 if(result.q) | |
3182 { | |
3183 // if(!RTjpeg_mtest) | |
3184 // for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; | |
3185 return 0; | |
3186 } | |
3187 // printf("."); | |
3188 return 1; | |
3189 } | |
3190 | |
3191 #else | |
3192 int RTjpeg_bcomp(__s16 *old, __u16 *mask) | |
3193 { | |
3194 int i; | |
3195 | |
3196 for(i=0; i<64; i++) | |
3197 if(abs(old[i]-RTjpeg_block[i])>*mask) | |
3198 { | |
3199 if(!RTjpeg_mtest) | |
3200 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; | |
3201 return 0; | |
3202 } | |
3203 return 1; | |
3204 } | |
3205 #endif | |
3206 | |
3207 void RTjpeg_set_test(int i) | |
3208 { | |
3209 RTjpeg_mtest=i; | |
3210 } | |
3211 | |
3212 int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) | |
3213 { | |
3214 __s8 * sb; | |
3215 //rh __s16 *block; | |
3216 register __s8 * bp1 = bp + (RTjpeg_width<<3); | |
3217 register __s8 * bp2 = bp + RTjpeg_Ysize; | |
3218 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); | |
3219 register int i, j, k; | |
3220 | |
3221 #ifdef MMX | |
3222 emms(); | |
3223 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); | |
3224 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask); | |
3225 #else | |
3226 RTjpeg_lmask=lmask; | |
3227 RTjpeg_cmask=cmask; | |
3228 #endif | |
3229 | |
3230 sb=sp; | |
3231 block=RTjpeg_old; | |
3232 /* Y */ | |
3233 for(i=RTjpeg_height>>1; i; i-=8) | |
3234 { | |
3235 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) | |
3236 { | |
3237 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); | |
3238 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3239 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3240 { | |
3241 *((__u8 *)sp++)=255; | |
3242 } | |
3243 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3244 block+=64; | |
3245 | |
3246 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
3247 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3248 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3249 { | |
3250 *((__u8 *)sp++)=255; | |
3251 } | |
3252 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3253 block+=64; | |
3254 | |
3255 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth); | |
3256 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3257 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3258 { | |
3259 *((__u8 *)sp++)=255; | |
3260 } | |
3261 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3262 block+=64; | |
3263 | |
3264 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
3265 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3266 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3267 { | |
3268 *((__u8 *)sp++)=255; | |
3269 } | |
3270 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3271 block+=64; | |
3272 | |
3273 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); | |
3274 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
3275 if(RTjpeg_bcomp(block, &RTjpeg_cmask)) | |
3276 { | |
3277 *((__u8 *)sp++)=255; | |
3278 } | |
3279 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
3280 block+=64; | |
3281 | |
3282 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); | |
3283 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
3284 if(RTjpeg_bcomp(block, &RTjpeg_cmask)) | |
3285 { | |
3286 *((__u8 *)sp++)=255; | |
3287 } | |
3288 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
3289 block+=64; | |
3290 } | |
3291 bp+=RTjpeg_width<<4; | |
3292 bp1+=RTjpeg_width<<4; | |
3293 bp2+=RTjpeg_width<<2; | |
3294 bp3+=RTjpeg_width<<2; | |
3295 | |
3296 } | |
3297 #ifdef MMX | |
3298 emms(); | |
3299 #endif | |
3300 return (sp-sb); | |
3301 } | |
3302 | |
3303 | |
3304 int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) | |
3305 { | |
3306 __s8 * sb; | |
3307 __s16 *block; | |
3308 register __s8 * bp2; | |
3309 register __s8 * bp3; | |
3310 register int i, j, k; | |
3311 | |
3312 #ifdef MMX | |
3313 emms(); | |
3314 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); | |
3315 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask); | |
3316 #else | |
3317 RTjpeg_lmask=lmask; | |
3318 RTjpeg_cmask=cmask; | |
3319 #endif | |
3320 | |
3321 bp = bp - RTjpeg_width*0; | |
3322 bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0; | |
3323 bp3 = bp2 + RTjpeg_Csize; | |
3324 | |
3325 sb=sp; | |
3326 block=RTjpeg_old; | |
3327 /* Y */ | |
3328 for(i=RTjpeg_height; i; i-=8) | |
3329 { | |
3330 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8) | |
3331 { | |
3332 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth); | |
3333 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3334 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3335 { | |
3336 *((__u8 *)sp++)=255; | |
3337 } | |
3338 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3339 block+=64; | |
3340 | |
3341 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth); | |
3342 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3343 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3344 { | |
3345 *((__u8 *)sp++)=255; | |
3346 } | |
3347 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3348 block+=64; | |
3349 | |
3350 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth); | |
3351 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
3352 if(RTjpeg_bcomp(block, &RTjpeg_cmask)) | |
3353 { | |
3354 *((__u8 *)sp++)=255; | |
3355 } | |
3356 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
3357 block+=64; | |
3358 | |
3359 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth); | |
3360 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt); | |
3361 if(RTjpeg_bcomp(block, &RTjpeg_cmask)) | |
3362 { | |
3363 *((__u8 *)sp++)=255; | |
3364 } | |
3365 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8); | |
3366 block+=64; | |
3367 | |
3368 } | |
3369 bp+=RTjpeg_width<<3; | |
3370 bp2+=RTjpeg_width<<2; | |
3371 bp3+=RTjpeg_width<<2; | |
3372 } | |
3373 printf ("%d\n", (int)(block - RTjpeg_old)); | |
3374 #ifdef MMX | |
3375 emms(); | |
3376 #endif | |
3377 return (sp-sb); | |
3378 } | |
3379 | |
3380 int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask) | |
3381 { | |
3382 __s8 * sb; | |
3383 __s16 *block; | |
3384 int i, j; | |
3385 | |
3386 #ifdef MMX | |
3387 emms(); | |
3388 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); | |
3389 #else | |
3390 RTjpeg_lmask=lmask; | |
3391 #endif | |
3392 | |
3393 | |
3394 sb=sp; | |
3395 block=RTjpeg_old; | |
3396 /* Y */ | |
3397 for(i=0; i<RTjpeg_height; i+=8) | |
3398 { | |
3399 for(j=0; j<RTjpeg_width; j+=8) | |
3400 { | |
3401 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width); | |
3402 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt); | |
3403 if(RTjpeg_bcomp(block, &RTjpeg_lmask)) | |
3404 { | |
3405 *((__u8 *)sp++)=255; | |
3406 // printf("* %d ", sp[-1]); | |
3407 } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8); | |
3408 block+=64; | |
3409 } | |
3410 bp+=RTjpeg_width<<3; | |
3411 } | |
3412 #ifdef MMX | |
3413 emms(); | |
3414 #endif | |
3415 return (sp-sb); | |
3416 } | |
3417 | |
3418 void RTjpeg_color_init(void) | |
3419 { | |
3420 } | |
3421 | |
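/*
 * The constants below are YCbCr->RGB conversion coefficients in 16.16 fixed
 * point, i.e. roughly coefficient * 65536: Ky = 76284 ~ 1.164*65536,
 * KcrG = 53281 ~ 0.813*65536, KcbG = 25625 ~ 0.391*65536 and
 * KcbB = 132252 ~ 2.018*65536 match the usual BT.601 expansion of
 * studio-range YCbCr.  Note that KcrR is given the same value as Ky rather
 * than the textbook ~1.596*65536.
 */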
3422 #define KcrR 76284 | |
3423 #define KcrG 53281 | |
3424 #define KcbG 25625 | |
3425 #define KcbB 132252 | |
3426 #define Ky 76284 | |
3427 | |
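/*
 * RTjpeg_yuv422rgb: converts a planar 4:2:2 buffer (Y plane, then two
 * half-width chroma planes at offsets width*height and
 * width*height + width*height/2) into packed 3-byte pixels written in
 * R,G,B order.  The stride argument is not used by this routine.
 *
 * Worked example (illustration only, not part of the original source):
 * each channel is a clamped 16.16 fixed-point sum, e.g.
 *     R = clamp( ((Y-16)*Ky + (Cr-128)*KcrR) >> 16 )
 * so for Y=235 with neutral chroma (Cr=Cb=128) the chroma term vanishes and
 * (219*76284) >> 16 = 254, just below full scale after clamping to [0,255].
 */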
3428 void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride) | |
3429 { | |
3430 int tmp; | |
3431 int i, j; | |
3432 __s32 y, crR, crG, cbG, cbB; | |
3433 __u8 *bufcr, *bufcb, *bufy, *bufoute; | |
3434 int yskip; | |
3435 | |
3436 yskip=RTjpeg_width; | |
3437 | |
3438 bufcb=&buf[RTjpeg_width*RTjpeg_height]; | |
3439 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2]; | |
3440 bufy=&buf[0]; | |
3441 bufoute=rgb; | |
3442 | |
3443 for(i=0; i<(RTjpeg_height); i++) | |
3444 { | |
3445 for(j=0; j<RTjpeg_width; j+=2) | |
3446 { | |
3447 crR=(*bufcr-128)*KcrR; | |
3448 crG=(*(bufcr++)-128)*KcrG; | |
3449 cbG=(*bufcb-128)*KcbG; | |
3450 cbB=(*(bufcb++)-128)*KcbB; | |
3451 | |
3452 y=(bufy[j]-16)*Ky; | |
3453 | |
3454 tmp=(y+crR)>>16; | |
3455 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3456 tmp=(y-crG-cbG)>>16; | |
3457 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3458 tmp=(y+cbB)>>16; | |
3459 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3460 | |
3461 y=(bufy[j+1]-16)*Ky; | |
3462 | |
3463 tmp=(y+crR)>>16; | |
3464 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3465 tmp=(y-crG-cbG)>>16; | |
3466 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3467 tmp=(y+cbB)>>16; | |
3468 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3469 | |
3470 } | |
3471 bufy+=yskip; | |
3472 } | |
3473 } | |
3474 | |
3475 | |
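/*
 * RTjpeg_yuv420rgb: planar 4:2:0 to packed 3-byte pixels in R,G,B order.
 * Two output rows are produced per pass so that each Cb/Cr pair is shared
 * by a 2x2 block of luma samples; oskip steps the two row pointers over a
 * caller-supplied stride (or a tightly packed row when stride is 0).
 */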
3476 void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride) | |
3477 { | |
3478 int tmp; | |
3479 int i, j; | |
3480 __s32 y, crR, crG, cbG, cbB; | |
3481 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; | |
3482 int oskip, yskip; | |
3483 | |
3484 if(stride==0) | |
3485 oskip=RTjpeg_width*3; | |
3486 else | |
3487 oskip=2*stride-RTjpeg_width*3; | |
3488 | |
3489 yskip=RTjpeg_width; | |
3490 | |
3491 bufcb=&buf[RTjpeg_width*RTjpeg_height]; | |
3492 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; | |
3493 bufy=&buf[0]; | |
3494 bufoute=rgb; | |
3495 bufouto=rgb+RTjpeg_width*3; | |
3496 | |
3497 for(i=0; i<(RTjpeg_height>>1); i++) | |
3498 { | |
3499 for(j=0; j<RTjpeg_width; j+=2) | |
3500 { | |
3501 crR=(*bufcr-128)*KcrR; | |
3502 crG=(*(bufcr++)-128)*KcrG; | |
3503 cbG=(*bufcb-128)*KcbG; | |
3504 cbB=(*(bufcb++)-128)*KcbB; | |
3505 | |
3506 y=(bufy[j]-16)*Ky; | |
3507 | |
3508 tmp=(y+crR)>>16; | |
3509 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3510 tmp=(y-crG-cbG)>>16; | |
3511 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3512 tmp=(y+cbB)>>16; | |
3513 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3514 | |
3515 y=(bufy[j+1]-16)*Ky; | |
3516 | |
3517 tmp=(y+crR)>>16; | |
3518 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3519 tmp=(y-crG-cbG)>>16; | |
3520 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3521 tmp=(y+cbB)>>16; | |
3522 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3523 | |
3524 y=(bufy[j+yskip]-16)*Ky; | |
3525 | |
3526 tmp=(y+crR)>>16; | |
3527 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3528 tmp=(y-crG-cbG)>>16; | |
3529 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3530 tmp=(y+cbB)>>16; | |
3531 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3532 | |
3533 y=(bufy[j+1+yskip]-16)*Ky; | |
3534 | |
3535 tmp=(y+crR)>>16; | |
3536 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3537 tmp=(y-crG-cbG)>>16; | |
3538 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3539 tmp=(y+cbB)>>16; | |
3540 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3541 | |
3542 } | |
3543 bufoute+=oskip; | |
3544 bufouto+=oskip; | |
3545 bufy+=yskip<<1; | |
3546 } | |
3547 } | |
3548 | |
3549 | |
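/*
 * RTjpeg_yuvrgb32: same two-rows-per-pass 4:2:0 walk as above, but emitting
 * 4 bytes per pixel in B,G,R order; the fourth (alpha/padding) byte is
 * skipped rather than written, so callers that care about it should clear
 * the output buffer beforehand.  Note that the second chroma plane offset
 * is computed with /2 here, unlike the /4 used by RTjpeg_yuv420rgb and
 * RTjpeg_yuvrgb24.
 */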
3550 void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride) | |
3551 { | |
3552 int tmp; | |
3553 int i, j; | |
3554 __s32 y, crR, crG, cbG, cbB; | |
3555 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; | |
3556 int oskip, yskip; | |
3557 | |
3558 if(stride==0) | |
3559 oskip=RTjpeg_width*4; | |
3560 else | |
3561 oskip = 2*stride-RTjpeg_width*4; | |
3562 yskip=RTjpeg_width; | |
3563 | |
3564 bufcb=&buf[RTjpeg_width*RTjpeg_height]; | |
3565 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2]; | |
3566 bufy=&buf[0]; | |
3567 bufoute=rgb; | |
3568 bufouto=rgb+RTjpeg_width*4; | |
3569 | |
3570 for(i=0; i<(RTjpeg_height>>1); i++) | |
3571 { | |
3572 for(j=0; j<RTjpeg_width; j+=2) | |
3573 { | |
3574 crR=(*bufcr-128)*KcrR; | |
3575 crG=(*(bufcr++)-128)*KcrG; | |
3576 cbG=(*bufcb-128)*KcbG; | |
3577 cbB=(*(bufcb++)-128)*KcbB; | |
3578 | |
3579 y=(bufy[j]-16)*Ky; | |
3580 | |
3581 tmp=(y+cbB)>>16; | |
3582 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3583 tmp=(y-crG-cbG)>>16; | |
3584 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3585 tmp=(y+crR)>>16; | |
3586 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3587 bufoute++; | |
3588 | |
3589 y=(bufy[j+1]-16)*Ky; | |
3590 | |
3591 tmp=(y+cbB)>>16; | |
3592 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3593 tmp=(y-crG-cbG)>>16; | |
3594 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3595 tmp=(y+crR)>>16; | |
3596 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3597 bufoute++; | |
3598 | |
3599 y=(bufy[j+yskip]-16)*Ky; | |
3600 | |
3601 tmp=(y+cbB)>>16; | |
3602 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3603 tmp=(y-crG-cbG)>>16; | |
3604 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3605 tmp=(y+crR)>>16; | |
3606 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3607 bufouto++; | |
3608 | |
3609 y=(bufy[j+1+yskip]-16)*Ky; | |
3610 | |
3611 tmp=(y+cbB)>>16; | |
3612 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3613 tmp=(y-crG-cbG)>>16; | |
3614 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3615 tmp=(y+crR)>>16; | |
3616 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3617 bufouto++; | |
3618 | |
3619 } | |
3620 bufoute+=oskip; | |
3621 bufouto+=oskip; | |
3622 bufy+=yskip<<1; | |
3623 } | |
3624 } | |
3625 | |
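/*
 * RTjpeg_yuvrgb24: identical 4:2:0 conversion to RTjpeg_yuv420rgb above,
 * except that the three bytes per pixel are written in B,G,R order.
 */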
3626 void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride) | |
3627 { | |
3628 int tmp; | |
3629 int i, j; | |
3630 __s32 y, crR, crG, cbG, cbB; | |
3631 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; | |
3632 int oskip, yskip; | |
3633 | |
3634 if(stride==0) | |
3635 oskip=RTjpeg_width*3; | |
3636 else | |
3637 oskip=2*stride - RTjpeg_width*3; | |
3638 | |
3639 yskip=RTjpeg_width; | |
3640 | |
3641 bufcb=&buf[RTjpeg_width*RTjpeg_height]; | |
3642 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; | |
3643 bufy=&buf[0]; | |
3644 bufoute=rgb; | |
3645 bufouto=rgb+RTjpeg_width*3; | |
3646 | |
3647 for(i=0; i<(RTjpeg_height>>1); i++) | |
3648 { | |
3649 for(j=0; j<RTjpeg_width; j+=2) | |
3650 { | |
3651 crR=(*bufcr-128)*KcrR; | |
3652 crG=(*(bufcr++)-128)*KcrG; | |
3653 cbG=(*bufcb-128)*KcbG; | |
3654 cbB=(*(bufcb++)-128)*KcbB; | |
3655 | |
3656 y=(bufy[j]-16)*Ky; | |
3657 | |
3658 tmp=(y+cbB)>>16; | |
3659 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3660 tmp=(y-crG-cbG)>>16; | |
3661 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3662 tmp=(y+crR)>>16; | |
3663 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3664 | |
3665 y=(bufy[j+1]-16)*Ky; | |
3666 | |
3667 tmp=(y+cbB)>>16; | |
3668 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3669 tmp=(y-crG-cbG)>>16; | |
3670 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3671 tmp=(y+crR)>>16; | |
3672 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3673 | |
3674 y=(bufy[j+yskip]-16)*Ky; | |
3675 | |
3676 tmp=(y+cbB)>>16; | |
3677 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3678 tmp=(y-crG-cbG)>>16; | |
3679 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3680 tmp=(y+crR)>>16; | |
3681 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3682 | |
3683 y=(bufy[j+1+yskip]-16)*Ky; | |
3684 | |
3685 tmp=(y+cbB)>>16; | |
3686 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3687 tmp=(y-crG-cbG)>>16; | |
3688 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3689 tmp=(y+crR)>>16; | |
3690 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); | |
3691 | |
3692 } | |
3693 bufoute+=oskip; | |
3694 bufouto+=oskip; | |
3695 bufy+=yskip<<1; | |
3696 } | |
3697 } | |
3698 | |
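/*
 * RTjpeg_yuvrgb16: 4:2:0 conversion to 16-bit RGB565, stored low byte first
 * (little-endian).
 *
 * Illustration only -- a hypothetical helper, not present in or used by the
 * original source, showing the packing the code below performs inline:
 *
 *     static inline __u16 pack565(__u8 r, __u8 g, __u8 b)
 *     {
 *         return (__u16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
 *     }
 *
 * i.e. 5 bits of red in the top bits, 6 of green, 5 of blue.
 */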
3699 void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride) | |
3700 { | |
3701 int tmp; | |
3702 int i, j; | |
3703 __s32 y, crR, crG, cbG, cbB; | |
3704 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; | |
3705 int oskip, yskip; | |
3706 unsigned char r, g, b; | |
3707 | |
3708 if(stride==0) | |
3709 oskip=RTjpeg_width*2; | |
3710 else | |
3711 oskip=2*stride-RTjpeg_width*2; | |
3712 | |
3713 yskip=RTjpeg_width; | |
3714 | |
3715 bufcb=&buf[RTjpeg_width*RTjpeg_height]; | |
3716 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; | |
3717 bufy=&buf[0]; | |
3718 bufoute=rgb; | |
3719 bufouto=rgb+RTjpeg_width*2; | |
3720 | |
3721 for(i=0; i<(RTjpeg_height>>1); i++) | |
3722 { | |
3723 for(j=0; j<RTjpeg_width; j+=2) | |
3724 { | |
3725 crR=(*bufcr-128)*KcrR; | |
3726 crG=(*(bufcr++)-128)*KcrG; | |
3727 cbG=(*bufcb-128)*KcbG; | |
3728 cbB=(*(bufcb++)-128)*KcbB; | |
3729 | |
3730 y=(bufy[j]-16)*Ky; | |
3731 | |
3732 tmp=(y+cbB)>>16; | |
3733 b=(tmp>255)?255:((tmp<0)?0:tmp); | |
3734 tmp=(y-crG-cbG)>>16; | |
3735 g=(tmp>255)?255:((tmp<0)?0:tmp); | |
3736 tmp=(y+crR)>>16; | |
3737 r=(tmp>255)?255:((tmp<0)?0:tmp); | |
3738 tmp=(int)((int)b >> 3); | |
3739 tmp|=(int)(((int)g >> 2) << 5); | |
3740 tmp|=(int)(((int)r >> 3) << 11); | |
3741 *(bufoute++)=tmp&0xff; | |
3742 *(bufoute++)=tmp>>8; | |
3743 | |
3744 | |
3745 y=(bufy[j+1]-16)*Ky; | |
3746 | |
3747 tmp=(y+cbB)>>16; | |
3748 b=(tmp>255)?255:((tmp<0)?0:tmp); | |
3749 tmp=(y-crG-cbG)>>16; | |
3750 g=(tmp>255)?255:((tmp<0)?0:tmp); | |
3751 tmp=(y+crR)>>16; | |
3752 r=(tmp>255)?255:((tmp<0)?0:tmp); | |
3753 tmp=(int)((int)b >> 3); | |
3754 tmp|=(int)(((int)g >> 2) << 5); | |
3755 tmp|=(int)(((int)r >> 3) << 11); | |
3756 *(bufoute++)=tmp&0xff; | |
3757 *(bufoute++)=tmp>>8; | |
3758 | |
3759 y=(bufy[j+yskip]-16)*Ky; | |
3760 | |
3761 tmp=(y+cbB)>>16; | |
3762 b=(tmp>255)?255:((tmp<0)?0:tmp); | |
3763 tmp=(y-crG-cbG)>>16; | |
3764 g=(tmp>255)?255:((tmp<0)?0:tmp); | |
3765 tmp=(y+crR)>>16; | |
3766 r=(tmp>255)?255:((tmp<0)?0:tmp); | |
3767 tmp=(int)((int)b >> 3); | |
3768 tmp|=(int)(((int)g >> 2) << 5); | |
3769 tmp|=(int)(((int)r >> 3) << 11); | |
3770 *(bufouto++)=tmp&0xff; | |
3771 *(bufouto++)=tmp>>8; | |
3772 | |
3773 y=(bufy[j+1+yskip]-16)*Ky; | |
3774 | |
3775 tmp=(y+cbB)>>16; | |
3776 b=(tmp>255)?255:((tmp<0)?0:tmp); | |
3777 tmp=(y-crG-cbG)>>16; | |
3778 g=(tmp>255)?255:((tmp<0)?0:tmp); | |
3779 tmp=(y+crR)>>16; | |
3780 r=(tmp>255)?255:((tmp<0)?0:tmp); | |
3781 tmp=(int)((int)b >> 3); | |
3782 tmp|=(int)(((int)g >> 2) << 5); | |
3783 tmp|=(int)(((int)r >> 3) << 11); | |
3784 *(bufouto++)=tmp&0xff; | |
3785 *(bufouto++)=tmp>>8; | |
3786 | |
3787 } | |
3788 bufoute+=oskip; | |
3789 bufouto+=oskip; | |
3790 bufy+=yskip<<1; | |
3791 } | |
3792 } | |
3793 | |
3794 /* TODO: honour the stride argument (currently ignored below) */ | |
3795 | |
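/*
 * RTjpeg_yuvrgb8: 8-bit greyscale output is just the Y plane copied
 * verbatim; the chroma planes and the stride argument are ignored (see the
 * TODO above).
 */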
3796 void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride) | |
3797 { | |
9763 | 3798 memcpy(rgb, buf, RTjpeg_width*RTjpeg_height); |
3802 | 3799 } |
3800 |