comparison libmpcodecs/native/RTjpegN.c @ 5602:628c85c15c7b

moved to libmpcodecs/native/
author arpi
date Sat, 13 Apr 2002 18:03:02 +0000
parents RTjpegN.c@ab0797fc1215
children e9bd97d5c5cc
comparison
equal deleted inserted replaced
5601:fd85802f755b 5602:628c85c15c7b
1 /*
2 RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
3
4 With modifications by:
5 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6 and
7 (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22
23 */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include "config.h"
30 #ifdef HAVE_MMX
31 #define MMX
32 #endif
33
34 #include "RTjpegN.h"
35
36 #ifdef MMX
37 #include "mmx.h"
38 #endif
39
40 //#define SHOWBLOCK 1
41 #define BETTERCOMPRESSION 1
42
/* Zig-zag scan order: maps scan position -> row-major index inside an
   8x8 coefficient block, so the codec walks from low to high frequency.
   Indexed by every b2s/s2b loop below. */
static const unsigned char RTjpeg_ZZ[64]={
0,
8, 1,
2, 9, 16,
24, 17, 10, 3,
4, 11, 18, 25, 32,
40, 33, 26, 19, 12, 5,
6, 13, 20, 27, 34, 41, 48,
56, 49, 42, 35, 28, 21, 14, 7,
15, 22, 29, 36, 43, 50, 57,
58, 51, 44, 37, 30, 23,
31, 38, 45, 52, 59,
60, 53, 46, 39,
47, 54, 61,
62, 55,
63 };
59
/* Per-coefficient scale factors stored as 32.32 fixed point (value * 2^32).
   RTjpeg_dct_init() divides the quantiser tables by these entries, folding
   the DCT output scaling into quantisation.
   NOTE(review): presumably the AAN fast-DCT scale factors — confirm against
   the DCT implementation below. */
static const __u64 RTjpeg_aan_tab[64]={
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
};
70
#ifndef MMX
/* Scalar DCT workspace (64 values + slack; extra 31 presumably for
   alignment — TODO confirm). */
static __s32 RTjpeg_ws[64+31];
#endif
/* Backing storage carved up into the block/table pointers below
   (2*64 block + 4 tables of 4*64 bytes + 32 bytes alignment slack). */
__u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32];

static __s16 *block; // rh
static __s16 *RTjpeg_block;      /* current 8x8 coefficient block */
static __s32 *RTjpeg_lqt;        /* luma forward-quantiser table */
static __s32 *RTjpeg_cqt;        /* chroma forward-quantiser table */
static __u32 *RTjpeg_liqt;       /* luma inverse (de)quantiser table */
static __u32 *RTjpeg_ciqt;       /* chroma inverse (de)quantiser table */

static unsigned char RTjpeg_lb8;
static unsigned char RTjpeg_cb8;
static int RTjpeg_width, RTjpeg_height;
static int RTjpeg_Ywidth, RTjpeg_Cwidth;
static int RTjpeg_Ysize, RTjpeg_Csize;

/* Previous frame for inter-frame delta coding; NULL until allocated. */
static __s16 *RTjpeg_old=NULL;

#ifdef MMX
mmx_t RTjpeg_lmask;
mmx_t RTjpeg_cmask;
#else
__u16 RTjpeg_lmask;
__u16 RTjpeg_cmask;
#endif
int RTjpeg_mtest=0;
99
/* Default luminance quantisation table (the example table from the
   JPEG specification). */
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
    16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99
};

/* Default chrominance quantisation table (JPEG spec example table). */
static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
    17,  18,  24,  47,  99,  99,  99,  99,
    18,  21,  26,  66,  99,  99,  99,  99,
    24,  26,  56,  99,  99,  99,  99,  99,
    47,  66,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99
};
121
122 #ifdef BETTERCOMPRESSION
123
124 /*--------------------------------------------------*/
125 /* better encoding, but needs a lot more cpu time */
126 /* seems to be more effective than old method +lzo */
127 /* with this encoding lzo isn't efficient anymore */
128 /* there is still more potential for better */
129 /* encoding but that would need even more cputime */
130 /* anyway your mileage may vary */
131 /* */
132 /* written by Martin BIELY and Roman HOCHLEITNER */
133 /*--------------------------------------------------*/
134
135 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
136 /* Block to Stream (encoding) */
137 /* */
138
139 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
140 {
141 register int ci, co=1;
142 register __s16 ZZvalue;
143 register unsigned char bitten;
144 register unsigned char bitoff;
145
146 #ifdef SHOWBLOCK
147
148 int ii;
149 for (ii=0; ii < 64; ii++) {
150 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
151 }
152 fprintf(stdout, "\n\n");
153
154 #endif
155
156 // *strm++ = 0x10;
157 // *strm = 0x00;
158 //
159 // return 2;
160
161 // first byte allways written
162 (__u8)strm[0]=
163 (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
164
165
166 ci=63;
167 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
168
169 bitten = ((unsigned char)ci) << 2;
170
171 if (ci==0) {
172 (__u8)strm[1]= bitten;
173 co = 2;
174 return (int)co;
175 }
176
177 /* bitoff=0 because the high 6bit contain first non zero position */
178 bitoff = 0;
179 co = 1;
180
181 for(; ci>0; ci--) {
182
183 ZZvalue = data[RTjpeg_ZZ[ci]];
184
185 switch(ZZvalue) {
186 case 0:
187 break;
188 case 1:
189 bitten |= (0x01<<bitoff);
190 break;
191 case -1:
192 bitten |= (0x03<<bitoff);
193 break;
194 default:
195 bitten |= (0x02<<bitoff);
196 goto HERZWEH;
197 break;
198 }
199
200 if( bitoff == 0 ) {
201 (__u8)strm[co]= bitten;
202 bitten = 0;
203 bitoff = 8;
204 co++;
205 } /* "fall through" */
206 bitoff-=2;
207
208 }
209
210 /* ci must be 0 */
211 if(bitoff != 6) {
212
213 (__u8)strm[co]= bitten;
214 co++;
215
216 }
217 goto BAUCHWEH;
218
219 HERZWEH:
220 /* ci cannot be 0 */
221 /* correct bitoff to nibble boundaries */
222
223 switch(bitoff){
224 case 4:
225 case 6:
226 bitoff = 0;
227 break;
228 case 2:
229 case 0:
230 (__u8)strm[co]= bitten;
231 bitoff = 4;
232 co++;
233 bitten = 0; // clear half nibble values in bitten
234 break;
235 default:
236 break;
237 }
238
239 for(; ci>0; ci--) {
240
241 ZZvalue = data[RTjpeg_ZZ[ci]];
242
243 if( (ZZvalue > 7) || (ZZvalue < -7) ) {
244 bitten |= (0x08<<bitoff);
245 goto HIRNWEH;
246 }
247
248 bitten |= (ZZvalue&0xf)<<bitoff;
249
250 if( bitoff == 0 ) {
251 (__u8)strm[co]= bitten;
252 bitten = 0;
253 bitoff = 8;
254 co++;
255 } /* "fall thru" */
256 bitoff-=4;
257 }
258
259 /* ci must be 0 */
260 if( bitoff == 0 ) {
261 (__u8)strm[co]= bitten;
262 co++;
263 }
264 goto BAUCHWEH;
265
266 HIRNWEH:
267
268 (__u8)strm[co]= bitten;
269 co++;
270
271
272 /* bitting is over now we bite */
273 for(; ci>0; ci--) {
274
275 ZZvalue = data[RTjpeg_ZZ[ci]];
276
277 if(ZZvalue>0)
278 {
279 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
280 }
281 else
282 {
283 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
284 }
285
286 }
287
288
289 BAUCHWEH:
290 /* we gotoo much now we are ill */
291 #ifdef SHOWBLOCK
292 {
293 int i;
294 fprintf(stdout, "\nco = '%d'\n", co);
295 for (i=0; i < co+2; i++) {
296 fprintf(stdout, "%d ", strm[i]);
297 }
298 fprintf(stdout, "\n\n");
299 }
300 #endif
301
302 return (int)co;
303 }
304
305 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
306 /* Stream to Block (decoding) */
307 /* */
308
309 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
310 {
311 int ci;
312 register int co;
313 register int i;
314 register unsigned char bitten;
315 register unsigned char bitoff;
316
317 /* first byte always read */
318 i=RTjpeg_ZZ[0];
319 data[i]=((__u8)strm[0])*qtbl[i];
320
321 /* we start at the behind */
322
323 bitten = ((unsigned char)strm[1]) >> 2;
324 co = 63;
325 for(; co > bitten; co--) {
326
327 data[RTjpeg_ZZ[co]] = 0;
328
329 }
330
331 if (co==0) {
332 ci = 2;
333 goto AUTOBAHN;
334 }
335
336 /* we have to read the last 2 bits of the second byte */
337 ci=1;
338 bitoff = 0;
339
340 for(; co>0; co--) {
341
342 bitten = ((unsigned char)strm[ci]) >> bitoff;
343 bitten &= 0x03;
344
345 i=RTjpeg_ZZ[co];
346
347 switch( bitten ) {
348 case 0x03:
349 data[i]= -qtbl[i];
350 break;
351 case 0x02:
352 goto FUSSWEG;
353 break;
354 case 0x01:
355 data[i]= qtbl[i];
356 break;
357 case 0x00:
358 data[i]= 0;
359 break;
360 default:
361
362 }
363
364 if( bitoff == 0 ) {
365 bitoff = 8;
366 ci++;
367 }
368 bitoff -= 2;
369 }
370 /* co is 0 now */
371 /* data is written properly */
372
373 /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
374 if (bitoff!=6) ci++;
375
376 goto AUTOBAHN;
377
378
379 FUSSWEG:
380 /* correct bitoff to nibble */
381 switch(bitoff){
382 case 4:
383 case 6:
384 bitoff = 0;
385 break;
386 case 2:
387 case 0:
388 /* we have to read from the next byte */
389 ci++;
390 bitoff = 4;
391 break;
392 default:
393 break;
394 }
395
396 for(; co>0; co--) {
397
398 bitten = ((unsigned char)strm[ci]) >> bitoff;
399 bitten &= 0x0f;
400
401 i=RTjpeg_ZZ[co];
402
403 if( bitten == 0x08 ) {
404 goto STRASSE;
405 }
406
407 /* the compiler cannot do sign extension for signed nibbles */
408 if( bitten & 0x08 ) {
409 bitten |= 0xf0;
410 }
411 /* the unsigned char bitten now is a valid signed char */
412
413 data[i]=((signed char)bitten)*qtbl[i];
414
415 if( bitoff == 0 ) {
416 bitoff = 8;
417 ci++;
418 }
419 bitoff -= 4;
420 }
421 /* co is 0 */
422
423 /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
424 if (bitoff!=4) ci++;
425
426 goto AUTOBAHN;
427
428 STRASSE:
429 ci++;
430
431 for(; co>0; co--) {
432 i=RTjpeg_ZZ[co];
433 data[i]=strm[ci++]*qtbl[i];
434 }
435
436 /* ci now is the count, because it points to next element => no incrementing */
437
438 AUTOBAHN:
439
440 #ifdef SHOWBLOCK
441 fprintf(stdout, "\nci = '%d'\n", ci);
442 for (i=0; i < 64; i++) {
443 fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
444 }
445 fprintf(stdout, "\n\n");
446 #endif
447
448 return ci;
449 }
450
451 #else
452
453 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
454 {
455 register int ci, co=1, tmp;
456 register __s16 ZZvalue;
457
458 #ifdef SHOWBLOCK
459
460 int ii;
461 for (ii=0; ii < 64; ii++) {
462 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
463 }
464 fprintf(stdout, "\n\n");
465
466 #endif
467
468 (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
469
470 for(ci=1; ci<=bt8; ci++)
471 {
472 ZZvalue = data[RTjpeg_ZZ[ci]];
473
474 if(ZZvalue>0)
475 {
476 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
477 }
478 else
479 {
480 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
481 }
482 }
483
484 for(; ci<64; ci++)
485 {
486 ZZvalue = data[RTjpeg_ZZ[ci]];
487
488 if(ZZvalue>0)
489 {
490 strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue;
491 }
492 else if(ZZvalue<0)
493 {
494 strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue;
495 }
496 else /* compress zeros */
497 {
498 tmp=ci;
499 do
500 {
501 ci++;
502 }
503 while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
504
505 strm[co++]=(__s8)(63+(ci-tmp));
506 ci--;
507 }
508 }
509 return (int)co;
510 }
511
512 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
513 {
514 int ci=1, co=1, tmp;
515 register int i;
516
517 i=RTjpeg_ZZ[0];
518 data[i]=((__u8)strm[0])*qtbl[i];
519
520 for(co=1; co<=bt8; co++)
521 {
522 i=RTjpeg_ZZ[co];
523 data[i]=strm[ci++]*qtbl[i];
524 }
525
526 for(; co<64; co++)
527 {
528 if(strm[ci]>63)
529 {
530 tmp=co+strm[ci]-63;
531 for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
532 co--;
533 } else
534 {
535 i=RTjpeg_ZZ[co];
536 data[i]=strm[ci]*qtbl[i];
537 }
538 ci++;
539 }
540 return (int)ci;
541 }
542 #endif
543
544 #if defined(MMX)
/* Narrow the 32-bit quantiser tables to 16 bits in place, so the MMX
   quantiser below can pair them with pmaddwd.  Walking upwards is safe:
   iteration i writes bytes [2i, 2i+1] and reads bytes [4i, 4i+3], so no
   unread source element is ever overwritten.
   NOTE(review): deliberately aliases __s16* onto the __s32 tables -
   technically a strict-aliasing violation, kept as-is. */
void RTjpeg_quant_init(void)
{
 int i;
 __s16 *qtbl;

 qtbl=(__s16 *)RTjpeg_lqt;
 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];

 qtbl=(__s16 *)RTjpeg_cqt;
 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
}
556
/* 4 x 0x0001: paired with the quantiser words so pmaddwd computes b*q + ... */
static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL;
/* 4 x 0x7fff: rounding constant folded in via the same pmaddwd */
static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL;

/* Quantise one 8x8 block in place:
     block[i] = (block[i]*qtbl[i] + 0x7fff) >> 16
   (same arithmetic as the scalar version below), processing 4
   coefficients per iteration with MMX.  qtbl must already have been
   narrowed to 16-bit entries by RTjpeg_quant_init(). */
void RTjpeg_quant(__s16 *block, __s32 *qtbl)
{
 int i;
 mmx_t *bl, *ql;

 ql=(mmx_t *)qtbl;
 bl=(mmx_t *)block;

 movq_m2r(RTjpeg_ones, mm6);
 movq_m2r(RTjpeg_half, mm7);

 /* 16 iterations x 4 words = 64 coefficients */
 for(i=16; i; i--)
 {
   movq_m2r(*(ql++), mm0); /* quant vals (4) */
   movq_m2r(*bl, mm2); /* block vals (4) */
   movq_r2r(mm0, mm1);
   movq_r2r(mm2, mm3);

   punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
   punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */

   punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
   punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */

   pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
   pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */

   /* keep only the high 16 bits of each 32-bit product */
   psrad_i2r(16, mm0);
   psrad_i2r(16, mm1);

   packssdw_r2r(mm1, mm0);

   movq_r2m(mm0, *(bl++));

 }
}
596 #else
/* Scalar build: the 32-bit quantiser tables are used directly, so
   there is nothing to set up (the MMX build narrows them here). */
void RTjpeg_quant_init(void)
{
}
600
601 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
602 {
603 int i;
604
605 for(i=0; i<64; i++)
606 block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16);
607 }
608 #endif
609
610 /*
611 * Perform the forward DCT on one block of samples.
612 */
613 #ifdef MMX
614 static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL;
615 static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL;
616 static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
617 static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
618 static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;
619
620 #else
621
622 #define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */
623 #define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */
624 #define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */
625 #define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */
626
627 #define DESCALE10(x) (__s16)( ((x)+128) >> 8)
628 #define DESCALE20(x) (__s16)(((x)+32768) >> 16)
629 #define D_MULTIPLY(var,const) ((__s32) ((var) * (const)))
630 #endif
631
632 void RTjpeg_dct_init(void)
633 {
634 int i;
635
636 for(i=0; i<64; i++)
637 {
638 RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
639 RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
640 }
641 }
642
643 void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
644 {
645 #ifndef MMX
646 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
647 __s32 tmp10, tmp11, tmp12, tmp13;
648 __s32 z1, z2, z3, z4, z5, z11, z13;
649 __u8 *idataptr;
650 __s16 *odataptr;
651 __s32 *wsptr;
652 int ctr;
653
654 idataptr = idata;
655 wsptr = RTjpeg_ws;
656 for (ctr = 7; ctr >= 0; ctr--) {
657 tmp0 = idataptr[0] + idataptr[7];
658 tmp7 = idataptr[0] - idataptr[7];
659 tmp1 = idataptr[1] + idataptr[6];
660 tmp6 = idataptr[1] - idataptr[6];
661 tmp2 = idataptr[2] + idataptr[5];
662 tmp5 = idataptr[2] - idataptr[5];
663 tmp3 = idataptr[3] + idataptr[4];
664 tmp4 = idataptr[3] - idataptr[4];
665
666 tmp10 = (tmp0 + tmp3); /* phase 2 */
667 tmp13 = tmp0 - tmp3;
668 tmp11 = (tmp1 + tmp2);
669 tmp12 = tmp1 - tmp2;
670
671 wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
672 wsptr[4] = (tmp10 - tmp11)<<8;
673
674 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
675 wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
676 wsptr[6] = (tmp13<<8) - z1;
677
678 tmp10 = tmp4 + tmp5; /* phase 2 */
679 tmp11 = tmp5 + tmp6;
680 tmp12 = tmp6 + tmp7;
681
682 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
683 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
684 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
685 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
686
687 z11 = (tmp7<<8) + z3; /* phase 5 */
688 z13 = (tmp7<<8) - z3;
689
690 wsptr[5] = z13 + z2; /* phase 6 */
691 wsptr[3] = z13 - z2;
692 wsptr[1] = z11 + z4;
693 wsptr[7] = z11 - z4;
694
695 idataptr += rskip<<3; /* advance pointer to next row */
696 wsptr += 8;
697 }
698
699 wsptr = RTjpeg_ws;
700 odataptr=odata;
701 for (ctr = 7; ctr >= 0; ctr--) {
702 tmp0 = wsptr[0] + wsptr[56];
703 tmp7 = wsptr[0] - wsptr[56];
704 tmp1 = wsptr[8] + wsptr[48];
705 tmp6 = wsptr[8] - wsptr[48];
706 tmp2 = wsptr[16] + wsptr[40];
707 tmp5 = wsptr[16] - wsptr[40];
708 tmp3 = wsptr[24] + wsptr[32];
709 tmp4 = wsptr[24] - wsptr[32];
710
711 tmp10 = tmp0 + tmp3; /* phase 2 */
712 tmp13 = tmp0 - tmp3;
713 tmp11 = tmp1 + tmp2;
714 tmp12 = tmp1 - tmp2;
715
716 odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
717 odataptr[32] = DESCALE10(tmp10 - tmp11);
718
719 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
720 odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
721 odataptr[48] = DESCALE20((tmp13<<8) - z1);
722
723 tmp10 = tmp4 + tmp5; /* phase 2 */
724 tmp11 = tmp5 + tmp6;
725 tmp12 = tmp6 + tmp7;
726
727 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
728 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
729 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
730 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
731
732 z11 = (tmp7<<8) + z3; /* phase 5 */
733 z13 = (tmp7<<8) - z3;
734
735 odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
736 odataptr[24] = DESCALE20(z13 - z2);
737 odataptr[8] = DESCALE20(z11 + z4);
738 odataptr[56] = DESCALE20(z11 - z4);
739
740 odataptr++; /* advance pointer to next column */
741 wsptr++;
742 }
743 #else
744 volatile mmx_t tmp6, tmp7;
745 register mmx_t *dataptr = (mmx_t *)odata;
746 mmx_t *idata2 = (mmx_t *)idata;
747
748 // first copy the input 8 bit to the destination 16 bits
749
750 movq_m2r(RTjpeg_zero, mm2);
751
752
753 movq_m2r(*idata2, mm0);
754 movq_r2r(mm0, mm1);
755
756 punpcklbw_r2r(mm2, mm0);
757 movq_r2m(mm0, *(dataptr));
758
759 punpckhbw_r2r(mm2, mm1);
760 movq_r2m(mm1, *(dataptr+1));
761
762 idata2 += rskip;
763
764 movq_m2r(*idata2, mm0);
765 movq_r2r(mm0, mm1);
766
767 punpcklbw_r2r(mm2, mm0);
768 movq_r2m(mm0, *(dataptr+2));
769
770 punpckhbw_r2r(mm2, mm1);
771 movq_r2m(mm1, *(dataptr+3));
772
773 idata2 += rskip;
774
775 movq_m2r(*idata2, mm0);
776 movq_r2r(mm0, mm1);
777
778 punpcklbw_r2r(mm2, mm0);
779 movq_r2m(mm0, *(dataptr+4));
780
781 punpckhbw_r2r(mm2, mm1);
782 movq_r2m(mm1, *(dataptr+5));
783
784 idata2 += rskip;
785
786 movq_m2r(*idata2, mm0);
787 movq_r2r(mm0, mm1);
788
789 punpcklbw_r2r(mm2, mm0);
790 movq_r2m(mm0, *(dataptr+6));
791
792 punpckhbw_r2r(mm2, mm1);
793 movq_r2m(mm1, *(dataptr+7));
794
795 idata2 += rskip;
796
797 movq_m2r(*idata2, mm0);
798 movq_r2r(mm0, mm1);
799
800 punpcklbw_r2r(mm2, mm0);
801 movq_r2m(mm0, *(dataptr+8));
802
803 punpckhbw_r2r(mm2, mm1);
804 movq_r2m(mm1, *(dataptr+9));
805
806 idata2 += rskip;
807
808 movq_m2r(*idata2, mm0);
809 movq_r2r(mm0, mm1);
810
811 punpcklbw_r2r(mm2, mm0);
812 movq_r2m(mm0, *(dataptr+10));
813
814 punpckhbw_r2r(mm2, mm1);
815 movq_r2m(mm1, *(dataptr+11));
816
817 idata2 += rskip;
818
819 movq_m2r(*idata2, mm0);
820 movq_r2r(mm0, mm1);
821
822 punpcklbw_r2r(mm2, mm0);
823 movq_r2m(mm0, *(dataptr+12));
824
825 punpckhbw_r2r(mm2, mm1);
826 movq_r2m(mm1, *(dataptr+13));
827
828 idata2 += rskip;
829
830 movq_m2r(*idata2, mm0);
831 movq_r2r(mm0, mm1);
832
833 punpcklbw_r2r(mm2, mm0);
834 movq_r2m(mm0, *(dataptr+14));
835
836 punpckhbw_r2r(mm2, mm1);
837 movq_r2m(mm1, *(dataptr+15));
838
839 /* Start Transpose to do calculations on rows */
840
841 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
842
843 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
844 movq_r2r(mm7, mm5);
845
846 punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
847 movq_r2r(mm6, mm2);
848
849 punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
850 movq_r2r(mm7, mm1);
851
852 movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
853 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
854
855 movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
856 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
857
858 movq_r2m(mm7,*(dataptr+9)); // write result 1
859 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
860
861 movq_r2m(mm1,*(dataptr+11)); // write result 2
862 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
863
864 movq_r2r(mm5, mm1);
865 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
866
867 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
868 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
869
870 movq_r2m(mm5,*(dataptr+13)); // write result 3
871
872 // last 4x4 done
873
874 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
875
876 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
877 movq_r2r(mm0, mm6);
878
879 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
880 movq_r2r(mm2, mm7);
881
882 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
883 movq_r2r(mm0, mm4);
884
885 //
886 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
887 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
888
889 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
890 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
891
892 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
893 movq_r2r(mm1, mm2); // copy first line
894
895 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
896 movq_r2r(mm6, mm5); // copy first intermediate result
897
898 movq_r2m(mm0, *(dataptr+8)); // write result 1
899 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
900
901 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
902 movq_r2r(mm3, mm0); // copy third line
903
904 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
905
906 movq_r2m(mm4, *(dataptr+10)); // write result 2 out
907 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
908
909 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
910 movq_r2r(mm1, mm4);
911
912 movq_r2m(mm6, *(dataptr+12)); // write result 3 out
913 punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
914
915 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
916 movq_r2r(mm2, mm6);
917
918 movq_r2m(mm5, *(dataptr+14)); // write result 4 out
919 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
920
921 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
922 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
923
924 movq_r2m(mm4, *(dataptr+3)); // write result 6 out
925 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
926
927 movq_r2m(mm2, *(dataptr+5)); // write result 7 out
928
929 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
930
931 movq_r2m(mm6, *(dataptr+7)); // write result 8 out
932
933
934 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
935
936 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
937 movq_r2r(mm0, mm2);
938
939 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
940 movq_r2r(mm7, mm4);
941
942 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
943 movq_r2r(mm0, mm1);
944
945 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
946 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
947
948 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
949 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
950
951 movq_r2r(mm0, mm7); // write result 1
952 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
953
954 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
955 movq_r2r(mm1, mm6); // write result 2
956
957 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
958 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
959
960 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
961 movq_r2r(mm2, mm3); // copy first intermediate result
962
963 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
964 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
965
966 movq_r2m(mm7, tmp7);
967 movq_r2r(mm2, mm5); // write result 3
968
969 movq_r2m(mm6, tmp6);
970 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
971
972 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
973 movq_r2r(mm3, mm4); // write result 4
974
975 /************************************************************************************************
976 End of Transpose
977 ************************************************************************************************/
978
979
980 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
981 movq_r2r(mm0, mm7);
982
983 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
984 movq_r2r(mm1, mm6);
985
986 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
987 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
988
989 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
990 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
991
992 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
993 paddw_r2r(mm7, mm6); // tmp12 + tmp13
994
995 /* stage 3 */
996
997 movq_m2r(tmp6, mm2);
998 movq_r2r(mm0, mm3);
999
1000 psllw_i2r(2, mm6); // m8 * 2^2
1001 paddw_r2r(mm1, mm0);
1002
1003 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1004 psubw_r2r(mm1, mm3);
1005
1006 movq_r2m(mm0, *dataptr);
1007 movq_r2r(mm7, mm0);
1008
1009 /* Odd part */
1010 movq_r2m(mm3, *(dataptr+8));
1011 paddw_r2r(mm5, mm4); // tmp10
1012
1013 movq_m2r(tmp7, mm3);
1014 paddw_r2r(mm6, mm0); // tmp32
1015
1016 paddw_r2r(mm2, mm5); // tmp11
1017 psubw_r2r(mm6, mm7); // tmp33
1018
1019 movq_r2m(mm0, *(dataptr+4));
1020 paddw_r2r(mm3, mm2); // tmp12
1021
1022 /* stage 4 */
1023
1024 movq_r2m(mm7, *(dataptr+12));
1025 movq_r2r(mm4, mm1); // copy of tmp10
1026
1027 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1028 psllw_i2r(2, mm4); // m8 * 2^2
1029
1030 movq_m2r(RTjpeg_C2mC6, mm0);
1031 psllw_i2r(2, mm1);
1032
1033 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1034 psllw_i2r(2, mm2);
1035
1036 pmulhw_r2r(mm0, mm4); // z5
1037
1038 /* stage 5 */
1039
1040 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1041 psllw_i2r(2, mm5);
1042
1043 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1044 movq_r2r(mm3, mm0); // copy tmp7
1045
1046 movq_m2r(*(dataptr+1), mm7);
1047 paddw_r2r(mm1, mm4); // z2
1048
1049 paddw_r2r(mm1, mm2); // z4
1050
1051 paddw_r2r(mm5, mm0); // z11
1052 psubw_r2r(mm5, mm3); // z13
1053
1054 /* stage 6 */
1055
1056 movq_r2r(mm3, mm5); // copy z13
1057 psubw_r2r(mm4, mm3); // y3=z13 - z2
1058
1059 paddw_r2r(mm4, mm5); // y5=z13 + z2
1060 movq_r2r(mm0, mm6); // copy z11
1061
1062 movq_r2m(mm3, *(dataptr+6)); //save y3
1063 psubw_r2r(mm2, mm0); // y7=z11 - z4
1064
1065 movq_r2m(mm5, *(dataptr+10)); //save y5
1066 paddw_r2r(mm2, mm6); // y1=z11 + z4
1067
1068 movq_r2m(mm0, *(dataptr+14)); //save y7
1069
1070 /************************************************
1071 * End of 1st 4 rows
1072 ************************************************/
1073
1074 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1075 movq_r2r(mm7, mm0); // copy x0
1076
1077 movq_r2m(mm6, *(dataptr+2)); //save y1
1078
1079 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1080 movq_r2r(mm1, mm6); // copy x1
1081
1082 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1083
1084 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1085 movq_r2r(mm2, mm5); // copy x2
1086
1087 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1088 movq_r2r(mm3, mm4); // copy x3
1089
1090 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1091
1092 movq_r2m(mm7, tmp7); // save tmp07
1093 movq_r2r(mm0, mm7); // copy tmp00
1094
1095 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1096
1097 /* stage 2, Even Part */
1098
1099 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1100
1101 movq_r2m(mm6, tmp6); // save tmp07
1102 movq_r2r(mm1, mm6); // copy tmp01
1103
1104 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1105 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1106
1107 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1108
1109 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1110 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1111
1112 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1113
1114 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1115 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1116
1117 /* stage 3, Even and stage 4 & 5 even */
1118
1119 movq_m2r(tmp6, mm2); // load tmp6
1120 movq_r2r(mm0, mm3); // copy tmp10
1121
1122 psllw_i2r(2, mm6); // shift z1
1123 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1124
1125 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1126 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1127
1128 movq_r2m(mm0, *(dataptr+1)); //save y0
1129 movq_r2r(mm7, mm0); // copy tmp13
1130
1131 /* odd part */
1132
1133 movq_r2m(mm3, *(dataptr+9)); //save y4
1134 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1135
1136 movq_m2r(tmp7, mm3); // load tmp7
1137 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1138
1139 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1140 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1141
1142 movq_r2m(mm0, *(dataptr+5)); //save y2
1143 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1144
1145 /* stage 4 */
1146
1147 movq_r2m(mm7, *(dataptr+13)); //save y6
1148 movq_r2r(mm4, mm1); // copy tmp10
1149
1150 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1151 psllw_i2r(2, mm4); // shift tmp10
1152
1153 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1154 psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1155
1156 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1157 psllw_i2r(2, mm5); // prepare for multiply
1158
1159 pmulhw_r2r(mm0, mm4); // multiply by converted real
1160
1161 /* stage 5 */
1162
1163 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1164 psllw_i2r(2, mm2); // prepare for multiply
1165
1166 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1167 movq_r2r(mm3, mm0); // copy tmp7
1168
1169 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1170 paddw_r2r(mm1, mm4); // z2
1171
1172 paddw_r2r(mm5, mm0); // z11
1173 psubw_r2r(mm5, mm3); // z13
1174
1175 /* stage 6 */
1176
1177 movq_r2r(mm3, mm5); // copy z13
1178 paddw_r2r(mm1, mm2); // z4
1179
1180 movq_r2r(mm0, mm6); // copy z11
1181 psubw_r2r(mm4, mm5); // y3
1182
1183 paddw_r2r(mm2, mm6); // y1
1184 paddw_r2r(mm4, mm3); // y5
1185
1186 movq_r2m(mm5, *(dataptr+7)); //save y3
1187
1188 movq_r2m(mm6, *(dataptr+3)); //save y1
1189 psubw_r2r(mm2, mm0); // y7
1190
1191 /************************************************************************************************
1192 Start of Transpose
1193 ************************************************************************************************/
1194
1195 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1196 movq_r2r(mm7, mm5); // copy first line
1197
1198 punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1199 movq_r2r(mm6, mm2); // copy third line
1200
1201 punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1202 movq_r2r(mm7, mm1); // copy first intermediate result
1203
1204 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1205
1206 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1207
1208 movq_r2m(mm7, *(dataptr+9)); // write result 1
1209 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1210
1211 movq_r2m(mm1, *(dataptr+11)); // write result 2
1212 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1213
1214 movq_r2r(mm5, mm1); // copy first intermediate result
1215 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1216
1217 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1218 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1219
1220 movq_r2m(mm5, *(dataptr+13)); // write result 3
1221
1222 /****** last 4x4 done */
1223
1224 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1225
1226 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1227 movq_r2r(mm0, mm6); // copy first line
1228
1229 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1230 movq_r2r(mm2, mm7); // copy third line
1231
1232 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1233 movq_r2r(mm0, mm4); // copy first intermediate result
1234
1235
1236
1237 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1238 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1239
1240 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1241 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1242
1243 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1244 movq_r2r(mm1, mm2); // copy first line
1245
1246 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1247 movq_r2r(mm6, mm5); // copy first intermediate result
1248
1249 movq_r2m(mm0, *(dataptr+8)); // write result 1
1250 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1251
1252 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1253 movq_r2r(mm3, mm0); // copy third line
1254
1255 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1256
1257 movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1258 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1259
1260 punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines
1261 movq_r2r(mm1, mm4); // copy second intermediate result
1262
1263 movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1264 punpckldq_r2r(mm3, mm1); //
1265
1266 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1267 movq_r2r(mm2, mm6); // copy second intermediate result
1268
1269 movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1270 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1271
1272 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1273 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1274
1275 movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1276 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1277
1278 movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1279
1280 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1281
1282 movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1283
1284 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1285
1286 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1287 movq_r2r(mm0, mm2); // copy first line
1288
1289 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1290 movq_r2r(mm7, mm4); // copy third line
1291
1292 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1293 movq_r2r(mm0, mm1); // copy first intermediate result
1294
1295 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1296 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1297
1298 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1299 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1300
1301 movq_r2r(mm0, mm7); // write result 1
1302 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1303
1304 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1305 movq_r2r(mm1, mm6); // write result 2
1306
1307 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1308 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1309
1310 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1311 movq_r2r(mm2, mm3); // copy first intermediate result
1312
1313 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1314 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1315
1316 movq_r2m(mm7, tmp7); // save tmp07
1317 movq_r2r(mm2, mm5); // write result 3
1318
1319 movq_r2m(mm6, tmp6); // save tmp06
1320
1321 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1322
1323 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1324 movq_r2r(mm3, mm4); // write result 4
1325
1326 /************************************************************************************************
1327 End of Transpose 2
1328 ************************************************************************************************/
1329
1330 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1331 movq_r2r(mm0, mm7);
1332
1333 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1334 movq_r2r(mm1, mm6);
1335
1336 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1337 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1338
1339 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1340 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1341
1342 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1343 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1344
1345 /* stage 3 */
1346
1347 movq_m2r(tmp6, mm2);
1348 movq_r2r(mm0, mm3);
1349
1350 psllw_i2r(2, mm6); // m8 * 2^2
1351 paddw_r2r(mm1, mm0);
1352
1353 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1354 psubw_r2r(mm1, mm3);
1355
1356 movq_r2m(mm0, *dataptr);
1357 movq_r2r(mm7, mm0);
1358
1359 /* Odd part */
1360 movq_r2m(mm3, *(dataptr+8));
1361 paddw_r2r(mm5, mm4); // tmp10
1362
1363 movq_m2r(tmp7, mm3);
1364 paddw_r2r(mm6, mm0); // tmp32
1365
1366 paddw_r2r(mm2, mm5); // tmp11
1367 psubw_r2r(mm6, mm7); // tmp33
1368
1369 movq_r2m(mm0, *(dataptr+4));
1370 paddw_r2r(mm3, mm2); // tmp12
1371
1372 /* stage 4 */
1373 movq_r2m(mm7, *(dataptr+12));
1374 movq_r2r(mm4, mm1); // copy of tmp10
1375
1376 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1377 psllw_i2r(2, mm4); // m8 * 2^2
1378
1379 movq_m2r(RTjpeg_C2mC6, mm0);
1380 psllw_i2r(2, mm1);
1381
1382 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1383 psllw_i2r(2, mm2);
1384
1385 pmulhw_r2r(mm0, mm4); // z5
1386
1387 /* stage 5 */
1388
1389 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1390 psllw_i2r(2, mm5);
1391
1392 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1393 movq_r2r(mm3, mm0); // copy tmp7
1394
1395 movq_m2r(*(dataptr+1), mm7);
1396 paddw_r2r(mm1, mm4); // z2
1397
1398 paddw_r2r(mm1, mm2); // z4
1399
1400 paddw_r2r(mm5, mm0); // z11
1401 psubw_r2r(mm5, mm3); // z13
1402
1403 /* stage 6 */
1404
1405 movq_r2r(mm3, mm5); // copy z13
1406 psubw_r2r(mm4, mm3); // y3=z13 - z2
1407
1408 paddw_r2r(mm4, mm5); // y5=z13 + z2
1409 movq_r2r(mm0, mm6); // copy z11
1410
1411 movq_r2m(mm3, *(dataptr+6)); //save y3
1412 psubw_r2r(mm2, mm0); // y7=z11 - z4
1413
1414 movq_r2m(mm5, *(dataptr+10)); //save y5
1415 paddw_r2r(mm2, mm6); // y1=z11 + z4
1416
1417 movq_r2m(mm0, *(dataptr+14)); //save y7
1418
1419 /************************************************
1420 * End of 1st 4 rows
1421 ************************************************/
1422
1423 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1424 movq_r2r(mm7, mm0); // copy x0
1425
1426 movq_r2m(mm6, *(dataptr+2)); //save y1
1427
1428 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1429 movq_r2r(mm1, mm6); // copy x1
1430
1431 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1432
1433 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1434 movq_r2r(mm2, mm5); // copy x2
1435
1436 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1437 movq_r2r(mm3, mm4); // copy x3
1438
1439 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1440
1441 movq_r2m(mm7, tmp7); // save tmp07
1442 movq_r2r(mm0, mm7); // copy tmp00
1443
1444 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1445
1446 /* stage 2, Even Part */
1447
1448 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1449
1450 movq_r2m(mm6, tmp6); // save tmp07
1451 movq_r2r(mm1, mm6); // copy tmp01
1452
1453 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1454 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1455
1456 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1457
1458 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1459 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1460
1461 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1462
1463 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1464 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1465
1466 /* stage 3, Even and stage 4 & 5 even */
1467
1468 movq_m2r(tmp6, mm2); // load tmp6
1469 movq_r2r(mm0, mm3); // copy tmp10
1470
1471 psllw_i2r(2, mm6); // shift z1
1472 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1473
1474 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1475 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1476
1477 movq_r2m(mm0, *(dataptr+1)); //save y0
1478 movq_r2r(mm7, mm0); // copy tmp13
1479
1480 /* odd part */
1481
1482 movq_r2m(mm3, *(dataptr+9)); //save y4
1483 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1484
1485 movq_m2r(tmp7, mm3); // load tmp7
1486 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1487
1488 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1489 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1490
1491 movq_r2m(mm0, *(dataptr+5)); //save y2
1492 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1493
1494 /* stage 4 */
1495
1496 movq_r2m(mm7, *(dataptr+13)); //save y6
1497 movq_r2r(mm4, mm1); // copy tmp10
1498
1499 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1500 psllw_i2r(2, mm4); // shift tmp10
1501
1502 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1503 psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1504
1505 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1506 psllw_i2r(2, mm5); // prepare for multiply
1507
1508 pmulhw_r2r(mm0, mm4); // multiply by converted real
1509
1510 /* stage 5 */
1511
1512 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1513 psllw_i2r(2, mm2); // prepare for multiply
1514
1515 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1516 movq_r2r(mm3, mm0); // copy tmp7
1517
1518 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1519 paddw_r2r(mm1, mm4); // z2
1520
1521 paddw_r2r(mm5, mm0); // z11
1522 psubw_r2r(mm5, mm3); // z13
1523
1524 /* stage 6 */
1525
1526 movq_r2r(mm3, mm5); // copy z13
1527 paddw_r2r(mm1, mm2); // z4
1528
1529 movq_r2r(mm0, mm6); // copy z11
1530 psubw_r2r(mm4, mm5); // y3
1531
1532 paddw_r2r(mm2, mm6); // y1
1533 paddw_r2r(mm4, mm3); // y5
1534
1535 movq_r2m(mm5, *(dataptr+7)); //save y3
	psubw_r2r(mm2, mm0);		// y7=z11 - z4
1537
1538 movq_r2m(mm3, *(dataptr+11)); //save y5
1539
1540 movq_r2m(mm6, *(dataptr+3)); //save y1
1541
1542 movq_r2m(mm0, *(dataptr+15)); //save y7
1543
1544
1545 #endif
1546 }
1547
1548 #define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */
1549 #define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */
1550 #define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */
1551 #define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */
1552
1553 #define DESCALE(x) (__s16)( ((x)+4) >> 3)
1554
1555 /* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */
1556
1557 #define RL(x) ((x)>235) ? 235 : (((x)<16) ? 16 : (x))
1558 #define MULTIPLY(var,const) (((__s32) ((var) * (const)) + 128)>>8)
1559
1560 void RTjpeg_idct_init(void)
1561 {
1562 int i;
1563
1564 for(i=0; i<64; i++)
1565 {
1566 RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32;
1567 RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32;
1568 }
1569 }
1570
1571 void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
1572 {
1573 #ifdef MMX
1574
1575 static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL;
1576 static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
1577 static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL;
1578 static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL;
1579 static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
1580
1581 mmx_t workspace[64];
1582 mmx_t *wsptr = workspace;
1583 register mmx_t *dataptr = (mmx_t *)odata;
1584 mmx_t *idata = (mmx_t *)data;
1585
1586 rskip = rskip>>3;
1587 /*
1588 * Perform inverse DCT on one block of coefficients.
1589 */
1590
1591 /* Odd part */
1592
1593 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1594
1595 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1596
1597 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1598
1599 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1600
1601 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1602
1603 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1604
1605 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1606
1607 psllw_i2r(2, mm2); // shift z10
1608 movq_r2r(mm2, mm0); // copy z10
1609
1610 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1611 movq_r2r(mm3, mm5); // copy tmp4
1612
1613 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1614 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1615
1616 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1617 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1618
1619 psubw_r2r(mm1, mm6); // z11-z13
1620 psllw_i2r(2, mm5); // shift z12
1621
1622 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1623 movq_r2r(mm5, mm7); // copy z12
1624
1625 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1626 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1627
1628 //ok
1629
1630 /* Even part */
1631 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1632 psllw_i2r(2, mm6);
1633
1634 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1635
1636 paddw_r2r(mm5, mm0); // tmp10
1637
1638 paddw_r2r(mm7, mm2); // tmp12
1639
1640 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1641 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1642
1643 movq_r2r(mm1, mm5); // copy tmp1
1644 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1645
1646 psubw_r2r(mm4, mm5); // tmp1-tmp3
1647 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1648
1649 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1650 psllw_i2r(2, mm5); // shift tmp1-tmp3
1651
1652 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1653
1654 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1655 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1656
1657 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1658
1659 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1660
1661 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1662 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1663
1664 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1665 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1666
1667 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1668 movq_r2r(mm1, mm5); // copy tmp11
1669
1670 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1671 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1672
1673 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1674
1675 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1676 movq_r2r(mm7, mm0); // copy tmp0
1677
1678 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1679 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1680
1681 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1682
1683 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1684 movq_r2r(mm1, mm3); // copy tmp1
1685
1686 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1687 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1688
1689 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1690
1691 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1692 movq_r2r(mm4, mm1); // copy tmp3
1693
1694 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1695
1696 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1697
1698 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1699
1700 movq_r2m(mm4, *(wsptr+8));
1701 movq_r2r(mm5, mm7); // copy tmp2
1702
1703 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1704
1705 movq_r2m(mm1, *(wsptr+6));
1706 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1707
1708 movq_r2m(mm5, *(wsptr+4));
1709
1710 movq_r2m(mm7, *(wsptr+10));
1711
1712 //ok
1713
1714
1715 /*****************************************************************/
1716
1717 idata++;
1718 wsptr++;
1719
1720 /*****************************************************************/
1721
1722 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1723
1724 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1725
1726 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1727 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1728
1729 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1730 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1731
1732 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1733
1734 psllw_i2r(2, mm2); // shift z10
1735 movq_r2r(mm2, mm0); // copy z10
1736
1737 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1738 movq_r2r(mm3, mm5); // copy tmp4
1739
1740 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1741 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1742
1743 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1744 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1745
1746 psubw_r2r(mm1, mm6); // z11-z13
1747 psllw_i2r(2, mm5); // shift z12
1748
1749 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1750 movq_r2r(mm5, mm7); // copy z12
1751
1752 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1753 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1754
1755 //ok
1756
1757 /* Even part */
1758 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1759 psllw_i2r(2, mm6);
1760
1761 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1762
1763 paddw_r2r(mm5, mm0); // tmp10
1764
1765 paddw_r2r(mm7, mm2); // tmp12
1766
1767 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1768 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1769
1770 movq_r2r(mm1, mm5); // copy tmp1
1771 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1772
1773 psubw_r2r(mm4, mm5); // tmp1-tmp3
1774 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1775
1776 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1777 psllw_i2r(2, mm5); // shift tmp1-tmp3
1778
1779 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1780 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1781
1782 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1783
1784 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1785
1786 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1787
1788 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1789 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1790
1791 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1792 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1793
1794 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1795 movq_r2r(mm1, mm5); // copy tmp11
1796
1797 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1798 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1799
1800 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1801
1802 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1803 movq_r2r(mm7, mm0); // copy tmp0
1804
1805 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1806 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1807
1808 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1809
1810 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1811 movq_r2r(mm1, mm3); // copy tmp1
1812
1813 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1814 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1815
1816 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1817
1818 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1819 movq_r2r(mm4, mm1); // copy tmp3
1820
1821 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1822
1823 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1824
1825 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1826
1827 movq_r2m(mm4, *(wsptr+8));
1828 movq_r2r(mm5, mm7); // copy tmp2
1829
1830 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1831
1832 movq_r2m(mm1, *(wsptr+6));
1833 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1834
1835 movq_r2m(mm5, *(wsptr+4));
1836
1837 movq_r2m(mm7, *(wsptr+10));
1838
1839 /*****************************************************************/
1840
1841 /* Pass 2: process rows from work array, store into output array. */
1842 /* Note that we must descale the results by a factor of 8 == 2**3, */
1843 /* and also undo the PASS1_BITS scaling. */
1844
1845 /*****************************************************************/
1846 /* Even part */
1847
1848 wsptr--;
1849
1850 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1851 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1852 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1853 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1854 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1855
1856 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1857 movq_r2r(mm0, mm2);
1858
1859 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1860 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1861
1862 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1863 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1864
1865 movq_r2r(mm0, mm6);
1866 movq_r2r(mm3, mm5);
1867
1868 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1869 movq_r2r(mm2, mm1);
1870
1871 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1872 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1873
1874 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1875 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1876
1877 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1878 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1879
1880 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1881 movq_r2r(mm3, mm4);
1882
1883 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1884 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1885
1886 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1887 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1888
1889
1890 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1891 movq_r2r(mm6, mm2);
1892
1893 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1894 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1895
1896 movq_r2r(mm3, mm5);
1897 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1898
1899 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1900 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1901
1902 movq_r2r(mm4, mm7);
1903 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1904
1905 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1906
1907 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1908
1909 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1910 movq_r2r(mm1, mm6);
1911
1912 //ok
1913
1914 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1915 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1916
1917
1918 movq_r2r(mm0, mm2);
1919 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1920
1921 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1922 psllw_i2r(2, mm6);
1923
1924 pmulhw_m2r(fix_141, mm6);
1925 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1926
1927 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1928 movq_r2r(mm0, mm7);
1929
1930 // tmp0 = tmp10 + tmp13;
1931 // tmp3 = tmp10 - tmp13;
1932 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1933 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1934
1935 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1936 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1937 // tmp1 = tmp11 + tmp12;
1938 // tmp2 = tmp11 - tmp12;
1939 movq_r2r(mm1, mm5);
1940
1941 //OK
1942
1943 /* Odd part */
1944
1945 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1946 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1947 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1948 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1949 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1950 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1951
1952 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1953 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1954
1955 movq_r2r(mm3, mm6);
1956 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1957
1958 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1959 movq_r2r(mm3, mm2);
1960
1961 //Save tmp0 and tmp1 in wsptr
1962 movq_r2m(mm0, *(wsptr)); // save tmp0
1963 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1964
1965
1966 //Continue with z10 --- z13
1967 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1968 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1969
1970 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1971 movq_r2r(mm6, mm4);
1972
1973 movq_r2m(mm1, *(wsptr+1)); // save tmp1
1974 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1975
1976 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1977 movq_r2r(mm6, mm1);
1978
1979 //Save tmp2 and tmp3 in wsptr
1980 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1981 movq_r2r(mm2, mm4);
1982
1983 //Continue with z10 --- z13
1984 movq_r2m(mm5, *(wsptr+2)); // save tmp2
1985 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1986
1987 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1988 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1989
1990 movq_r2r(mm3, mm0);
1991 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1992
1993 movq_r2m(mm7, *(wsptr+3)); // save tmp3
1994 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1995
1996 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1997 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1998
1999 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2000 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2001
2002 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2003 movq_r2r(mm6, mm4);
2004
2005 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2006 movq_r2r(mm1, mm5);
2007
2008 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2009 movq_r2r(mm6, mm2);
2010
2011 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2012 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2013
2014 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2015 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2016
2017 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2018 movq_r2r(mm1, mm7);
2019
2020 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2021 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2022
2023 movq_r2r(mm6, mm5);
2024 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2025
2026 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2027 movq_r2r(mm2, mm4);
2028
2029 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2030
2031 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2032
2033 punpckhdq_r2r(mm6, mm4); /// wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2034
2035 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2036 movq_r2r(mm0, mm5);
2037
2038 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2039
2040 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2041 movq_r2r(mm3, mm4);
2042
2043 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2044 movq_r2r(mm5, mm1);
2045
2046 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2047 // tmp7 = z11 + z13; /* phase 5 */
2048 // tmp8 = z11 - z13; /* phase 5 */
2049 psubw_r2r(mm4, mm1); // tmp8
2050
2051 paddw_r2r(mm4, mm5); // tmp7
2052 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2053 psllw_i2r(2, mm1);
2054
2055 psllw_i2r(2, mm0);
2056
2057 pmulhw_m2r(fix_141, mm1); // tmp21
2058 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2059 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2060 psllw_i2r(2, mm3);
2061 movq_r2r(mm0, mm7);
2062
2063 pmulhw_m2r(fix_n184, mm7);
2064 movq_r2r(mm3, mm6);
2065
2066 movq_m2r(*(wsptr), mm2); // tmp0,final1
2067
2068 pmulhw_m2r(fix_108n184, mm6);
2069 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2070 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2071 movq_r2r(mm2, mm4); // final1
2072
2073 pmulhw_m2r(fix_184n261, mm0);
2074 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2075
2076 pmulhw_m2r(fix_184, mm3);
2077 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2078
2079 // tmp6 = tmp22 - tmp7; /* phase 2 */
2080 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2081
2082 paddw_r2r(mm6, mm7); // tmp20
2083 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2084
2085 paddw_r2r(mm0, mm3); // tmp22
2086
2087 // tmp5 = tmp21 - tmp6;
2088 psubw_r2r(mm5, mm3); // tmp6
2089
2090 // tmp4 = tmp20 + tmp5;
2091 movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2092 psubw_r2r(mm3, mm1); // tmp5
2093
2094 movq_r2r(mm0, mm6); // final2
2095 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2096
2097 /* Final output stage: scale down by a factor of 8 and range-limit */
2098
2099
2100 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2101 // & RANGE_MASK];
2102 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2103 // & RANGE_MASK]; final1
2104
2105
2106 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2107 // & RANGE_MASK];
2108 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2109 // & RANGE_MASK]; final2
2110 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2111 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2112
2113 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2114
2115 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2116
2117 movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2118 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2119
2120 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2121 // & RANGE_MASK];
2122 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2123 // & RANGE_MASK]; final3
2124 paddw_r2r(mm1, mm7); // tmp4
2125 movq_r2r(mm5, mm3);
2126
2127 paddw_r2r(mm1, mm5); // tmp2+tmp5
2128 psubw_r2r(mm1, mm3); // tmp2-tmp5
2129
2130 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2131
2132 movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2133 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2134
2135
2136
2137 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2138 // & RANGE_MASK];
2139 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2140 // & RANGE_MASK]; final4
2141 movq_r2r(mm4, mm6);
2142 paddw_r2r(mm7, mm4); // tmp3+tmp4
2143
2144 psubw_r2r(mm7, mm6); // tmp3-tmp4
2145 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2146
2147 // mov ecx, [dataptr]
2148
2149 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2150
2151 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2152
2153 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2154 movq_r2r(mm2, mm4);
2155
2156 movq_r2r(mm5, mm7);
2157 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2158
2159 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2160 movq_r2r(mm2, mm1);
2161
2162 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2163
2164 // add dataptr, 4
2165
2166 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2167
2168 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2169
2170 // add ecx, output_col
2171
2172 movq_r2r(mm7, mm6);
2173 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2174
2175 movq_r2r(mm2, mm0);
2176 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2177
2178 // mov idata, [dataptr]
2179
2180 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2181
2182 // add dataptr, 4
2183
2184 movq_r2r(mm1, mm3);
2185
2186 // add idata, output_col
2187
2188 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2189
2190 movq_r2m(mm2, *(dataptr));
2191
2192 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2193
2194 dataptr += rskip;
2195 movq_r2m(mm0, *(dataptr));
2196
2197 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2198 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2199
2200 dataptr += rskip;
2201 movq_r2m(mm1, *(dataptr));
2202
2203 dataptr += rskip;
2204 movq_r2m(mm3, *(dataptr));
2205
2206 /*******************************************************************/
2207
2208 wsptr += 8;
2209
2210 /*******************************************************************/
2211
2212 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2213 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2214 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2215 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2216 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2217
2218 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2219 movq_r2r(mm0, mm2);
2220
2221 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2222 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2223
2224 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2225 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2226
2227 movq_r2r(mm0, mm6);
2228 movq_r2r(mm3, mm5);
2229
2230 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2231 movq_r2r(mm2, mm1);
2232
2233 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2234 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2235
2236 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2237 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2238
2239 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2240 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2241
2242 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2243 movq_r2r(mm3, mm4);
2244
2245 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2246 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2247
2248 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2249 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2250
2251 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2252 movq_r2r(mm6, mm2);
2253
2254 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2255 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2256
2257 movq_r2r(mm3, mm5);
2258 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2259
2260 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2261 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2262
2263 movq_r2r(mm4, mm7);
2264 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2265
2266 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2267
2268 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2269
2270 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2271 movq_r2r(mm1, mm6);
2272
2273 //OK
2274
2275 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2276 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2277
2278 movq_r2r(mm0, mm2);
2279 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2280
2281 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2282 psllw_i2r(2, mm6);
2283
2284 pmulhw_m2r(fix_141, mm6);
2285 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2286
2287 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2288 movq_r2r(mm0, mm7);
2289
2290 // tmp0 = tmp10 + tmp13;
2291 // tmp3 = tmp10 - tmp13;
2292 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2293 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2294
2295 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2296 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2297 // tmp1 = tmp11 + tmp12;
2298 // tmp2 = tmp11 - tmp12;
2299 movq_r2r(mm1, mm5);
2300
2301 //OK
2302
2303
2304 /* Odd part */
2305
2306 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2307 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2308 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2309 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2310 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2311 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2312
2313 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2314 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2315
2316 movq_r2r(mm3, mm6);
2317 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2318
2319 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2320 movq_r2r(mm3, mm2);
2321
2322 //Save tmp0 and tmp1 in wsptr
2323 movq_r2m(mm0, *(wsptr)); // save tmp0
2324 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2325
2326
2327 //Continue with z10 --- z13
2328 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2329 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2330
2331 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2332 movq_r2r(mm6, mm4);
2333
2334 movq_r2m(mm1, *(wsptr+1)); // save tmp1
2335 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2336
2337 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2338 movq_r2r(mm6, mm1);
2339
2340 //Save tmp2 and tmp3 in wsptr
2341 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2342 movq_r2r(mm2, mm4);
2343
2344 //Continue with z10 --- z13
2345 movq_r2m(mm5, *(wsptr+2)); // save tmp2
2346 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2347
2348 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2349 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2350
2351 movq_r2r(mm3, mm0);
2352 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2353
2354 movq_r2m(mm7, *(wsptr+3)); // save tmp3
2355 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2356
2357 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2358 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2359
2360 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2361 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2362
2363 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2364 movq_r2r(mm6, mm4);
2365
2366 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2367 movq_r2r(mm1, mm5);
2368
2369 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2370 movq_r2r(mm6, mm2);
2371
2372 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2373 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2374
2375 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2376 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2377
2378 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2379 movq_r2r(mm1, mm7);
2380
2381 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2382 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2383
2384 movq_r2r(mm6, mm5);
2385 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2386
2387 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2388 movq_r2r(mm2, mm4);
2389
2390 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2391
2392 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2393
2394 punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2395
2396 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2397 movq_r2r(mm0, mm5);
2398
2399 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2400
2401 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2402 movq_r2r(mm3, mm4);
2403
2404 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2405 movq_r2r(mm5, mm1);
2406
2407 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2408 // tmp7 = z11 + z13; /* phase 5 */
2409 // tmp8 = z11 - z13; /* phase 5 */
2410 psubw_r2r(mm4, mm1); // tmp8
2411
2412 paddw_r2r(mm4, mm5); // tmp7
2413 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2414 psllw_i2r(2, mm1);
2415
2416 psllw_i2r(2, mm0);
2417
2418 pmulhw_m2r(fix_141, mm1); // tmp21
2419 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2420 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2421 psllw_i2r(2, mm3);
2422 movq_r2r(mm0, mm7);
2423
2424 pmulhw_m2r(fix_n184, mm7);
2425 movq_r2r(mm3, mm6);
2426
2427 movq_m2r(*(wsptr), mm2); // tmp0,final1
2428
2429 pmulhw_m2r(fix_108n184, mm6);
2430 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2431 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2432 movq_r2r(mm2, mm4); // final1
2433
2434 pmulhw_m2r(fix_184n261, mm0);
2435 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2436
2437 pmulhw_m2r(fix_184, mm3);
2438 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2439
2440 // tmp6 = tmp22 - tmp7; /* phase 2 */
2441 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2442
2443 paddw_r2r(mm6, mm7); // tmp20
2444 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2445
2446 paddw_r2r(mm0, mm3); // tmp22
2447
2448 // tmp5 = tmp21 - tmp6;
2449 psubw_r2r(mm5, mm3); // tmp6
2450
2451 // tmp4 = tmp20 + tmp5;
2452 movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2453 psubw_r2r(mm3, mm1); // tmp5
2454
2455 movq_r2r(mm0, mm6); // final2
2456 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2457
2458 /* Final output stage: scale down by a factor of 8 and range-limit */
2459
2460 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2461 // & RANGE_MASK];
2462 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2463 // & RANGE_MASK]; final1
2464
2465
2466 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2467 // & RANGE_MASK];
2468 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2469 // & RANGE_MASK]; final2
2470 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2471 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2472
2473 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2474
2475 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2476
2477 movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2478 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2479
2480 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2481 // & RANGE_MASK];
2482 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2483 // & RANGE_MASK]; final3
2484 paddw_r2r(mm1, mm7); // tmp4
2485 movq_r2r(mm5, mm3);
2486
2487 paddw_r2r(mm1, mm5); // tmp2+tmp5
2488 psubw_r2r(mm1, mm3); // tmp2-tmp5
2489
2490 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2491
2492 movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2493 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2494
2495
2496
2497 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2498 // & RANGE_MASK];
2499 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2500 // & RANGE_MASK]; final4
2501 movq_r2r(mm4, mm6);
2502 paddw_r2r(mm7, mm4); // tmp3+tmp4
2503
2504 psubw_r2r(mm7, mm6); // tmp3-tmp4
2505 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2506
2507 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2508
2509 /*
2510 movq_r2m(mm4, *dummy);
2511 fprintf(stderr, "3-4 %016llx\n", dummy);
2512 movq_r2m(mm4, *dummy);
2513 fprintf(stderr, "3+4 %016llx\n", dummy);
2514 */
2515
2516
2517 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2518
2519 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2520 movq_r2r(mm2, mm4);
2521
2522 movq_r2r(mm5, mm7);
2523 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2524
2525 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2526 movq_r2r(mm2, mm1);
2527
2528 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2529
2530 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2531
2532 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2533
2534 movq_r2r(mm7, mm6);
2535 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2536
2537 movq_r2r(mm2, mm0);
2538 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2539
2540 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2541
2542 movq_r2r(mm1, mm3);
2543
2544 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2545
2546 dataptr += rskip;
2547 movq_r2m(mm2, *(dataptr));
2548
2549 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2550
2551 dataptr += rskip;
2552 movq_r2m(mm0, *(dataptr));
2553
2554 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2555
2556 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2557
2558 dataptr += rskip;
2559 movq_r2m(mm1, *(dataptr));
2560
2561 dataptr += rskip;
2562 movq_r2m(mm3, *(dataptr));
2563
2564 #else
2565 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2566 __s32 tmp10, tmp11, tmp12, tmp13;
2567 __s32 z5, z10, z11, z12, z13;
2568 __s16 *inptr;
2569 __s32 *wsptr;
2570 __u8 *outptr;
2571 int ctr;
2572 __s32 dcval;
2573 __s32 workspace[64];
2574
2575 inptr = data;
2576 wsptr = workspace;
2577 for (ctr = 8; ctr > 0; ctr--) {
2578
2579 if ((inptr[8] | inptr[16] | inptr[24] |
2580 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2581 dcval = inptr[0];
2582 wsptr[0] = dcval;
2583 wsptr[8] = dcval;
2584 wsptr[16] = dcval;
2585 wsptr[24] = dcval;
2586 wsptr[32] = dcval;
2587 wsptr[40] = dcval;
2588 wsptr[48] = dcval;
2589 wsptr[56] = dcval;
2590
2591 inptr++;
2592 wsptr++;
2593 continue;
2594 }
2595
2596 tmp0 = inptr[0];
2597 tmp1 = inptr[16];
2598 tmp2 = inptr[32];
2599 tmp3 = inptr[48];
2600
2601 tmp10 = tmp0 + tmp2;
2602 tmp11 = tmp0 - tmp2;
2603
2604 tmp13 = tmp1 + tmp3;
2605 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2606
2607 tmp0 = tmp10 + tmp13;
2608 tmp3 = tmp10 - tmp13;
2609 tmp1 = tmp11 + tmp12;
2610 tmp2 = tmp11 - tmp12;
2611
2612 tmp4 = inptr[8];
2613 tmp5 = inptr[24];
2614 tmp6 = inptr[40];
2615 tmp7 = inptr[56];
2616
2617 z13 = tmp6 + tmp5;
2618 z10 = tmp6 - tmp5;
2619 z11 = tmp4 + tmp7;
2620 z12 = tmp4 - tmp7;
2621
2622 tmp7 = z11 + z13;
2623 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2624
2625 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2626 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2627 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2628
2629 tmp6 = tmp12 - tmp7;
2630 tmp5 = tmp11 - tmp6;
2631 tmp4 = tmp10 + tmp5;
2632
2633 wsptr[0] = (__s32) (tmp0 + tmp7);
2634 wsptr[56] = (__s32) (tmp0 - tmp7);
2635 wsptr[8] = (__s32) (tmp1 + tmp6);
2636 wsptr[48] = (__s32) (tmp1 - tmp6);
2637 wsptr[16] = (__s32) (tmp2 + tmp5);
2638 wsptr[40] = (__s32) (tmp2 - tmp5);
2639 wsptr[32] = (__s32) (tmp3 + tmp4);
2640 wsptr[24] = (__s32) (tmp3 - tmp4);
2641
2642 inptr++;
2643 wsptr++;
2644 }
2645
2646 wsptr = workspace;
2647 for (ctr = 0; ctr < 8; ctr++) {
2648 outptr = &(odata[ctr*rskip]);
2649
2650 tmp10 = wsptr[0] + wsptr[4];
2651 tmp11 = wsptr[0] - wsptr[4];
2652
2653 tmp13 = wsptr[2] + wsptr[6];
2654 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2655
2656 tmp0 = tmp10 + tmp13;
2657 tmp3 = tmp10 - tmp13;
2658 tmp1 = tmp11 + tmp12;
2659 tmp2 = tmp11 - tmp12;
2660
2661 z13 = wsptr[5] + wsptr[3];
2662 z10 = wsptr[5] - wsptr[3];
2663 z11 = wsptr[1] + wsptr[7];
2664 z12 = wsptr[1] - wsptr[7];
2665
2666 tmp7 = z11 + z13;
2667 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2668
2669 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2670 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2671 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2672
2673 tmp6 = tmp12 - tmp7;
2674 tmp5 = tmp11 - tmp6;
2675 tmp4 = tmp10 + tmp5;
2676
2677 outptr[0] = RL(DESCALE(tmp0 + tmp7));
2678 outptr[7] = RL(DESCALE(tmp0 - tmp7));
2679 outptr[1] = RL(DESCALE(tmp1 + tmp6));
2680 outptr[6] = RL(DESCALE(tmp1 - tmp6));
2681 outptr[2] = RL(DESCALE(tmp2 + tmp5));
2682 outptr[5] = RL(DESCALE(tmp2 - tmp5));
2683 outptr[4] = RL(DESCALE(tmp3 + tmp4));
2684 outptr[3] = RL(DESCALE(tmp3 - tmp4));
2685
2686 wsptr += 8;
2687 }
2688 #endif
2689 }
2690 /*
2691
2692 Main Routines
2693
2694 This file contains most of the initialisation and control functions
2695
2696 (C) Justin Schoeman 1998
2697
2698 */
2699
2700 /*
2701
2702 Private function
2703
2704 Initialise all the cache-aliged data blocks
2705
2706 */
2707
void RTjpeg_init_data(void)
{
 unsigned long dptr;

 /* Carve the working buffers out of the static RTjpeg_alldata pool.
    Advance by 32 then round down to a 32-byte boundary, i.e. cache-align
    the base; assumes the pool has at least 32 bytes of slack and that a
    pointer round-trips through unsigned long (true on the ILP32/LP64
    targets this code was written for). */
 dptr=(unsigned long)&(RTjpeg_alldata[0]);
 dptr+=32;
 dptr=dptr>>5;
 dptr=dptr<<5; /* cache align data */

 /* Sub-allocate five consecutive 64-entry tables from the aligned base. */
 RTjpeg_block=(__s16 *)dptr;   /* one 8x8 block of DCT coefficients */
 dptr+=sizeof(__s16)*64;
 RTjpeg_lqt=(__s32 *)dptr;     /* luma quantiser table */
 dptr+=sizeof(__s32)*64;
 RTjpeg_cqt=(__s32 *)dptr;     /* chroma quantiser table */
 dptr+=sizeof(__s32)*64;
 RTjpeg_liqt=(__u32 *)dptr;    /* luma inverse-quantiser table */
 dptr+=sizeof(__u32)*64;
 RTjpeg_ciqt=(__u32 *)dptr;    /* chroma inverse-quantiser table */
}
2727
2728 /*
2729
2730 External Function
2731
2732 Re-set quality factor
2733
2734 Input: buf -> pointer to 128 ints for quant values store to pass back to
2735 init_decompress.
2736 Q -> quality factor (192=best, 32=worst)
2737 */
2738
void RTjpeg_init_Q(__u8 Q)
{
 int i;
 __u64 qual;

 /* Map Q (0..255) into 32-bit fixed point where 255 ~ 2.0 and 0 = 0.0. */
 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 /* Rebuild all four quantiser tables.  NOTE(review): this loop is
    duplicated in RTjpeg_init_compress -- keep the two in sync. */
 for(i=0; i<64; i++)
 {
  /* Forward quantiser = qual / base_table (16.16 fixed point), then >>3;
     clamped to at least 1 so the division below never hits zero. */
  RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
  if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
  RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
  if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
  /* Inverse tables, then re-derive the forward tables from them so that
     quantise/dequantise round-trip on exactly the same values. */
  RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
  RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
  RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
  RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }

 /* Scan the zig-zag order for the first inverse quantiser value above 8;
    lb8/cb8 end up one position before it (the ++ inside the while
    condition performs the scan).  Presumably the threshold between
    8-bit and 16-bit coefficient coding in b2s/s2b -- confirm there. */
 RTjpeg_lb8=0;
 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 RTjpeg_lb8--;
 RTjpeg_cb8=0;
 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 RTjpeg_cb8--;

 /* Re-initialise the DCT/IDCT/quantiser state for the new tables. */
 RTjpeg_dct_init();
 RTjpeg_idct_init();
 RTjpeg_quant_init();
}
2769
2770 /*
2771
2772 External Function
2773
2774 Initialise compression.
2775
2776 Input: buf -> pointer to 128 ints for quant values store to pass back to
2777 init_decompress.
2778 width -> width of image
2779 height -> height of image
2780 Q -> quality factor (192=best, 32=worst)
2781
2782 */
2783
void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
{
 int i;
 __u64 qual;

 RTjpeg_init_data();

 /* Frame geometry.  Ywidth/Cwidth are strides in 8-pixel block units;
    Csize is the size of one chroma plane at half horizontal resolution. */
 RTjpeg_width=width;
 RTjpeg_height=height;
 RTjpeg_Ywidth = RTjpeg_width>>3;
 RTjpeg_Ysize=width * height;
 RTjpeg_Cwidth = RTjpeg_width>>4;
 RTjpeg_Csize= (width>>1) * height;

 /* Map Q (0..255) into 32-bit fixed point where 255 ~ 2.0 and 0 = 0.0. */
 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 /* Quantiser-table setup.  NOTE(review): byte-for-byte duplicate of the
    loop in RTjpeg_init_Q -- keep the two in sync. */
 for(i=0; i<64; i++)
 {
  RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
  if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
  RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
  if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
  /* Inverse tables, then re-derive the forward tables so the pair
     round-trips consistently. */
  RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
  RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
  RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
  RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }

 /* Locate the last zig-zag position whose inverse quantiser is <=8
    (the ++ inside the while condition performs the scan). */
 RTjpeg_lb8=0;
 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 RTjpeg_lb8--;
 RTjpeg_cb8=0;
 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 RTjpeg_cb8--;

 RTjpeg_dct_init();
 RTjpeg_quant_init();

 /* Export the inverse-quantiser tables (128 __u32: luma then chroma)
    so the caller can hand them to RTjpeg_init_decompress. */
 for(i=0; i<64; i++)
  buf[i]=RTjpeg_liqt[i];
 for(i=0; i<64; i++)
  buf[64+i]=RTjpeg_ciqt[i];
}
2827
2828 void RTjpeg_init_decompress(__u32 *buf, int width, int height)
2829 {
2830 int i;
2831
2832 RTjpeg_init_data();
2833
2834 RTjpeg_width=width;
2835 RTjpeg_height=height;
2836 RTjpeg_Ywidth = RTjpeg_width>>3;
2837 RTjpeg_Ysize=width * height;
2838 RTjpeg_Cwidth = RTjpeg_width>>4;
2839 RTjpeg_Csize= (width>>1) * height;
2840
2841 for(i=0; i<64; i++)
2842 {
2843 RTjpeg_liqt[i]=buf[i];
2844 RTjpeg_ciqt[i]=buf[i+64];
2845 }
2846
2847 RTjpeg_lb8=0;
2848 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2849 RTjpeg_lb8--;
2850 RTjpeg_cb8=0;
2851 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2852 RTjpeg_cb8--;
2853
2854 RTjpeg_idct_init();
2855
2856 // RTjpeg_color_init();
2857 }
2858
/* Compress one planar 4:2:0 frame at bp (Y plane, then the two chroma
   planes -- presumably U then V; confirm against the colourspace code)
   into the byte stream sp.  Returns the number of bytes written.
   NOTE(review): bp1..bp3 receive unsigned char* values into __s8*
   variables via implicit conversion -- historical code, kept as-is. */
int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
{
 __s8 * sb;
 register __s8 * bp1 = bp + (RTjpeg_width<<3);   /* luma, 8 lines down */
 register __s8 * bp2 = bp + RTjpeg_Ysize;        /* first chroma plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);  /* second chroma plane */
 register int i, j, k;

#ifdef MMX
 emms();
#endif
 sb=sp;
 /* Y */
 /* Each outer pass covers a 16-line band: four 8x8 luma blocks (two on
    the current 8 lines, two on the next) plus one block from each
    chroma plane, matching the 2x2 chroma subsampling.  Every block is
    DCT'd, quantised, and entropy-packed by b2s, which returns the
    number of bytes it appended to sp. */
 for(i=RTjpeg_height>>1; i; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

  }
  /* Step to the next 16-line band (luma) / 8-line band (chroma). */
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;

 }
#ifdef MMX
 emms();
#endif
 return (sp-sb);
}
2912
2913 int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp)
2914 {
2915 __s8 * sb;
2916 register __s8 * bp2 = bp + RTjpeg_Ysize;
2917 register __s8 * bp3 = bp2 + RTjpeg_Csize;
2918 register int i, j, k;
2919
2920 #ifdef MMX
2921 emms();
2922 #endif
2923 sb=sp;
2924 /* Y */
2925 for(i=RTjpeg_height; i; i-=8)
2926 {
2927 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2928 {
2929 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2930 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2931 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2932
2933 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2934 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2935 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2936
2937 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2938 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2939 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2940
2941 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2942 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2943 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2944
2945 }
2946 bp+=RTjpeg_width<<3;
2947 bp2+=RTjpeg_width<<2;
2948 bp3+=RTjpeg_width<<2;
2949
2950 }
2951 #ifdef MMX
2952 emms();
2953 #endif
2954 return (sp-sb);
2955 }
2956
2957 int RTjpeg_compress8(__s8 *sp, unsigned char *bp)
2958 {
2959 __s8 * sb;
2960 int i, j;
2961
2962 #ifdef MMX
2963 emms();
2964 #endif
2965
2966 sb=sp;
2967 /* Y */
2968 for(i=0; i<RTjpeg_height; i+=8)
2969 {
2970 for(j=0; j<RTjpeg_width; j+=8)
2971 {
2972 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
2973 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2974 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2975 }
2976 bp+=RTjpeg_width;
2977 }
2978
2979 #ifdef MMX
2980 emms();
2981 #endif
2982 return (sp-sb);
2983 }
2984
/* Decompress one planar 4:2:2 frame from the byte stream sp into bp
   (Y plane followed by the two chroma planes).  A 0xff stream byte
   (-1 read as __s8) marks an 8x8 block that is unchanged from the
   previous frame: it is skipped, leaving the old pixels in bp. */
void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp)
{
 register __s8 * bp2 = bp + RTjpeg_Ysize;    /* first chroma plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize); /* second chroma plane */
 int i, j,k;

#ifdef MMX
 emms();
#endif

 /* Y */
 /* Per 8-line strip: two luma blocks then one block from each chroma
    plane, in the same order RTjpeg_compressYUV422 emitted them.  s2b
    unpacks and dequantises one block, returning the bytes consumed;
    the idct writes the 8x8 pixels straight into the frame. */
 for(i=RTjpeg_height; i; i-=8)
 {
  for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
   }
  }
  /* Advance one 8-line strip in each plane. */
  bp+=RTjpeg_width<<3;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
#ifdef MMX
 emms();
#endif
}
3032
/* Decompress one planar 4:2:0 frame from the byte stream sp into bp
   (Y plane followed by the two half-size chroma planes).  A 0xff
   stream byte (-1 read as __s8) marks an unchanged 8x8 block that is
   skipped.  Block order mirrors RTjpeg_compressYUV420: per 16-line
   band, four luma blocks then one block from each chroma plane. */
void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
{
 register __s8 * bp1 = bp + (RTjpeg_width<<3);   /* luma, 8 lines down */
 register __s8 * bp2 = bp + RTjpeg_Ysize;        /* first chroma plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);  /* second chroma plane */
 int i, j,k;

#ifdef MMX
 emms();
#endif

 /* Y */
 for(i=RTjpeg_height>>1; i; i-=8)
 {
  for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
   }
   if(*sp==-1)sp++;
   else
   {
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
   }
  }
  /* Next 16-line band (luma) / 8-line band (chroma). */
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
#ifdef MMX
 emms();
#endif
}
3094
3095 void RTjpeg_decompress8(__s8 *sp, __u8 *bp)
3096 {
3097 int i, j;
3098
3099 #ifdef MMX
3100 emms();
3101 #endif
3102
3103 /* Y */
3104 for(i=0; i<RTjpeg_height; i+=8)
3105 {
3106 for(j=0; j<RTjpeg_width; j+=8)
3107 if(*sp==-1)sp++;
3108 else
3109 {
3110 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3111 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
3112 }
3113 bp+=RTjpeg_width<<3;
3114 }
3115 }
3116
3117 /*
3118 External Function
3119
3120 Initialise additional data structures for motion compensation
3121
3122 */
3123
3124 void RTjpeg_init_mcompress(void)
3125 {
3126 unsigned long tmp;
3127
3128 if(!RTjpeg_old)
3129 {
3130 RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32);
3131 tmp=(unsigned long)RTjpeg_old;
3132 tmp+=32;
3133 tmp=tmp>>5;
3134 RTjpeg_old=(__s16 *)(tmp<<5);
3135 }
3136 if (!RTjpeg_old)
3137 {
3138 fprintf(stderr, "RTjpeg: Could not allocate memory\n");
3139 exit(-1);
3140 }
3141 bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height)));
3142 }
3143
3144 #ifdef MMX
3145
/* MMX block comparator for motion compensation.
   Returns 1 when every coefficient of RTjpeg_block is within the
   threshold (mask, the same __s16 value replicated into all four words)
   of the matching coefficient in old; returns 0 as soon as any word of
   the accumulated comparison results is set. */
int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
{
 int i;
 mmx_t *mold=(mmx_t *)old;
 mmx_t *mblock=(mmx_t *)RTjpeg_block;
 volatile mmx_t result;
 static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL;

 movq_m2r(*mask, mm7);   /* mm7 = threshold, 4 x __s16              */
 movq_m2r(neg, mm6);     /* mm6 = all-ones, used as bitwise NOT     */
 pxor_r2r(mm5, mm5);     /* mm5 accumulates "over threshold" flags  */

 /* 8 iterations x 2 quadwords x 4 words = all 64 coefficients. */
 for(i=0; i<8; i++)
 {
  movq_m2r(*(mblock++), mm0);
  movq_m2r(*(mblock++), mm2);
  movq_m2r(*(mold++), mm1);
  movq_m2r(*(mold++), mm3);
  psubsw_r2r(mm1, mm0);   /* diff = new - old (saturating)          */
  psubsw_r2r(mm3, mm2);
  movq_r2r(mm0, mm1);
  movq_r2r(mm2, mm3);
  pcmpgtw_r2r(mm7, mm0);  /* diff > threshold ?                     */
  pcmpgtw_r2r(mm7, mm2);
  pxor_r2r(mm6, mm1);     /* ~diff (= -diff-1): checks the negative */
  pxor_r2r(mm6, mm3);     /*  side against the same threshold       */
  pcmpgtw_r2r(mm7, mm1);
  pcmpgtw_r2r(mm7, mm3);
  por_r2r(mm0, mm5);      /* fold every comparison into mm5         */
  por_r2r(mm2, mm5);
  por_r2r(mm1, mm5);
  por_r2r(mm3, mm5);
 }
 movq_r2m(mm5, result);

 if(result.q)             /* nonzero => some coefficient moved too far */
 {
// if(!RTjpeg_mtest)
//  for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
  return 0;
 }
// printf(".");
 return 1;
}
3190
3191 #else
3192 int RTjpeg_bcomp(__s16 *old, __u16 *mask)
3193 {
3194 int i;
3195
3196 for(i=0; i<64; i++)
3197 if(abs(old[i]-RTjpeg_block[i])>*mask)
3198 {
3199 if(!RTjpeg_mtest)
3200 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
3201 return 0;
3202 }
3203 return 1;
3204 }
3205 #endif
3206
/* Set test mode: when nonzero, the scalar RTjpeg_bcomp no longer copies
   changed blocks into the previous-frame store. */
void RTjpeg_set_test(int i)
{
 RTjpeg_mtest=i;
}
3211
/*
 * Delta ("motion") compress one planar YUV 4:2:0 frame.
 *
 * sp           output byte stream (written sequentially)
 * bp           source frame: full-size Y plane followed by two
 *              quarter-size chroma planes
 * lmask/cmask  change-detection thresholds for luma / chroma blocks
 *
 * Every 8x8 block is DCT'd and quantised into the global RTjpeg_block,
 * then compared (RTjpeg_bcomp) against the previous frame's copy kept in
 * RTjpeg_old: an unchanged block is emitted as the single marker byte 255,
 * a changed one is entropy-coded by RTjpeg_b2s().
 * Returns the number of bytes written to sp.
 */
int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
{
 __s8 * sb;
 //rh __s16 *block;
 /* NOTE(review): the local declaration of `block` above is commented out
    ("rh"), so this function relies on a file-scope `block` -- the sibling
    RTjpeg_mcompressYUV422() declares its own local.  Confirm a file-scope
    __s16 *block exists. */
 register __s8 * bp1 = bp + (RTjpeg_width<<3);  /* luma row-of-blocks below bp */
 register __s8 * bp2 = bp + RTjpeg_Ysize;       /* first chroma plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); /* second chroma plane */
 register int i, j, k;

#ifdef MMX
 emms();
 /* broadcast the 16-bit masks into all four words of an MMX quantity */
 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
#else
 RTjpeg_lmask=lmask;
 RTjpeg_cmask=cmask;
#endif

 sb=sp;
 block=RTjpeg_old;
 /* Y */
 /* Each outer iteration handles a 16-pixel-high band: a 2x2 group of luma
    blocks (from bp and bp1) plus one block from each chroma plane. */
 for(i=RTjpeg_height>>1; i; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;  /* unchanged-block marker */
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   /* chroma blocks use the chroma quantiser/tables and cmask */
   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;
  }
  bp+=RTjpeg_width<<4;   /* advance 16 luma rows */
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;  /* advance 8 chroma rows (planes are half width) */
  bp3+=RTjpeg_width<<2;

 }
#ifdef MMX
 emms();
#endif
 return (sp-sb);
}
3302
3303
3304 int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
3305 {
3306 __s8 * sb;
3307 __s16 *block;
3308 register __s8 * bp2;
3309 register __s8 * bp3;
3310 register int i, j, k;
3311
3312 #ifdef MMX
3313 emms();
3314 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
3315 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
3316 #else
3317 RTjpeg_lmask=lmask;
3318 RTjpeg_cmask=cmask;
3319 #endif
3320
3321 bp = bp - RTjpeg_width*0;
3322 bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0;
3323 bp3 = bp2 + RTjpeg_Csize;
3324
3325 sb=sp;
3326 block=RTjpeg_old;
3327 /* Y */
3328 for(i=RTjpeg_height; i; i-=8)
3329 {
3330 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
3331 {
3332 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
3333 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3334 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3335 {
3336 *((__u8 *)sp++)=255;
3337 }
3338 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3339 block+=64;
3340
3341 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
3342 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3343 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3344 {
3345 *((__u8 *)sp++)=255;
3346 }
3347 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3348 block+=64;
3349
3350 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
3351 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3352 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3353 {
3354 *((__u8 *)sp++)=255;
3355 }
3356 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3357 block+=64;
3358
3359 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
3360 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3361 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3362 {
3363 *((__u8 *)sp++)=255;
3364 }
3365 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3366 block+=64;
3367
3368 }
3369 bp+=RTjpeg_width<<3;
3370 bp2+=RTjpeg_width<<2;
3371 bp3+=RTjpeg_width<<2;
3372 }
3373 printf ("%d\n", block - RTjpeg_old);
3374 #ifdef MMX
3375 emms();
3376 #endif
3377 return (sp-sb);
3378 }
3379
3380 int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask)
3381 {
3382 __s8 * sb;
3383 __s16 *block;
3384 int i, j;
3385
3386 #ifdef MMX
3387 emms();
3388 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
3389 #else
3390 RTjpeg_lmask=lmask;
3391 #endif
3392
3393
3394 sb=sp;
3395 block=RTjpeg_old;
3396 /* Y */
3397 for(i=0; i<RTjpeg_height; i+=8)
3398 {
3399 for(j=0; j<RTjpeg_width; j+=8)
3400 {
3401 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
3402 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3403 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3404 {
3405 *((__u8 *)sp++)=255;
3406 // printf("* %d ", sp[-1]);
3407 } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3408 block+=64;
3409 }
3410 bp+=RTjpeg_width<<3;
3411 }
3412 #ifdef MMX
3413 emms();
3414 #endif
3415 return (sp-sb);
3416 }
3417
/* Intentionally empty: kept so the public API keeps an init hook for the
   colour-conversion routines, which need no runtime setup here. */
void RTjpeg_color_init(void)
{
}
3421
/* Fixed-point (value * 65536) YCbCr -> RGB conversion coefficients,
   approximately ITU-R BT.601 with studio-range luma:
     KcrG = 0.813*65536 (Cr contribution to G)
     KcbG = 0.391*65536 (Cb contribution to G)
     KcbB = 2.018*65536 (Cb contribution to B)
     Ky   = 1.164*65536 (16..235 -> 0..255 luma expansion)
   NOTE(review): KcrR equals Ky (1.164) instead of the conventional ~1.596
   for the Cr -> R term; confirm this is intentional before relying on it. */
#define KcrR 76284
#define KcrG 53281
#define KcbG 25625
#define KcbB 132252
#define Ky 76284
3427
3428 void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride)
3429 {
3430 int tmp;
3431 int i, j;
3432 __s32 y, crR, crG, cbG, cbB;
3433 __u8 *bufcr, *bufcb, *bufy, *bufoute;
3434 int yskip;
3435
3436 yskip=RTjpeg_width;
3437
3438 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3439 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3440 bufy=&buf[0];
3441 bufoute=rgb;
3442
3443 for(i=0; i<(RTjpeg_height); i++)
3444 {
3445 for(j=0; j<RTjpeg_width; j+=2)
3446 {
3447 crR=(*bufcr-128)*KcrR;
3448 crG=(*(bufcr++)-128)*KcrG;
3449 cbG=(*bufcb-128)*KcbG;
3450 cbB=(*(bufcb++)-128)*KcbB;
3451
3452 y=(bufy[j]-16)*Ky;
3453
3454 tmp=(y+crR)>>16;
3455 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3456 tmp=(y-crG-cbG)>>16;
3457 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3458 tmp=(y+cbB)>>16;
3459 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3460
3461 y=(bufy[j+1]-16)*Ky;
3462
3463 tmp=(y+crR)>>16;
3464 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3465 tmp=(y-crG-cbG)>>16;
3466 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3467 tmp=(y+cbB)>>16;
3468 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3469
3470 }
3471 bufy+=yskip;
3472 }
3473 }
3474
3475
3476 void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride)
3477 {
3478 int tmp;
3479 int i, j;
3480 __s32 y, crR, crG, cbG, cbB;
3481 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3482 int oskip, yskip;
3483
3484 if(stride==0)
3485 oskip=RTjpeg_width*3;
3486 else
3487 oskip=2*stride-RTjpeg_width*3;
3488
3489 yskip=RTjpeg_width;
3490
3491 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3492 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3493 bufy=&buf[0];
3494 bufoute=rgb;
3495 bufouto=rgb+RTjpeg_width*3;
3496
3497 for(i=0; i<(RTjpeg_height>>1); i++)
3498 {
3499 for(j=0; j<RTjpeg_width; j+=2)
3500 {
3501 crR=(*bufcr-128)*KcrR;
3502 crG=(*(bufcr++)-128)*KcrG;
3503 cbG=(*bufcb-128)*KcbG;
3504 cbB=(*(bufcb++)-128)*KcbB;
3505
3506 y=(bufy[j]-16)*Ky;
3507
3508 tmp=(y+crR)>>16;
3509 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3510 tmp=(y-crG-cbG)>>16;
3511 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3512 tmp=(y+cbB)>>16;
3513 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3514
3515 y=(bufy[j+1]-16)*Ky;
3516
3517 tmp=(y+crR)>>16;
3518 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3519 tmp=(y-crG-cbG)>>16;
3520 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3521 tmp=(y+cbB)>>16;
3522 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3523
3524 y=(bufy[j+yskip]-16)*Ky;
3525
3526 tmp=(y+crR)>>16;
3527 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3528 tmp=(y-crG-cbG)>>16;
3529 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3530 tmp=(y+cbB)>>16;
3531 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3532
3533 y=(bufy[j+1+yskip]-16)*Ky;
3534
3535 tmp=(y+crR)>>16;
3536 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3537 tmp=(y-crG-cbG)>>16;
3538 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3539 tmp=(y+cbB)>>16;
3540 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3541
3542 }
3543 bufoute+=oskip;
3544 bufouto+=oskip;
3545 bufy+=yskip<<1;
3546 }
3547 }
3548
3549
3550 void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride)
3551 {
3552 int tmp;
3553 int i, j;
3554 __s32 y, crR, crG, cbG, cbB;
3555 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3556 int oskip, yskip;
3557
3558 if(stride==0)
3559 oskip=RTjpeg_width*4;
3560 else
3561 oskip = 2*stride-RTjpeg_width*4;
3562 yskip=RTjpeg_width;
3563
3564 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3565 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3566 bufy=&buf[0];
3567 bufoute=rgb;
3568 bufouto=rgb+RTjpeg_width*4;
3569
3570 for(i=0; i<(RTjpeg_height>>1); i++)
3571 {
3572 for(j=0; j<RTjpeg_width; j+=2)
3573 {
3574 crR=(*bufcr-128)*KcrR;
3575 crG=(*(bufcr++)-128)*KcrG;
3576 cbG=(*bufcb-128)*KcbG;
3577 cbB=(*(bufcb++)-128)*KcbB;
3578
3579 y=(bufy[j]-16)*Ky;
3580
3581 tmp=(y+cbB)>>16;
3582 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3583 tmp=(y-crG-cbG)>>16;
3584 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3585 tmp=(y+crR)>>16;
3586 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3587 bufoute++;
3588
3589 y=(bufy[j+1]-16)*Ky;
3590
3591 tmp=(y+cbB)>>16;
3592 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3593 tmp=(y-crG-cbG)>>16;
3594 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3595 tmp=(y+crR)>>16;
3596 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3597 bufoute++;
3598
3599 y=(bufy[j+yskip]-16)*Ky;
3600
3601 tmp=(y+cbB)>>16;
3602 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3603 tmp=(y-crG-cbG)>>16;
3604 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3605 tmp=(y+crR)>>16;
3606 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3607 bufouto++;
3608
3609 y=(bufy[j+1+yskip]-16)*Ky;
3610
3611 tmp=(y+cbB)>>16;
3612 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3613 tmp=(y-crG-cbG)>>16;
3614 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3615 tmp=(y+crR)>>16;
3616 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3617 bufouto++;
3618
3619 }
3620 bufoute+=oskip;
3621 bufouto+=oskip;
3622 bufy+=yskip<<1;
3623 }
3624 }
3625
3626 void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride)
3627 {
3628 int tmp;
3629 int i, j;
3630 __s32 y, crR, crG, cbG, cbB;
3631 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3632 int oskip, yskip;
3633
3634 if(stride==0)
3635 oskip=RTjpeg_width*3;
3636 else
3637 oskip=2*stride - RTjpeg_width*3;
3638
3639 yskip=RTjpeg_width;
3640
3641 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3642 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3643 bufy=&buf[0];
3644 bufoute=rgb;
3645 bufouto=rgb+RTjpeg_width*3;
3646
3647 for(i=0; i<(RTjpeg_height>>1); i++)
3648 {
3649 for(j=0; j<RTjpeg_width; j+=2)
3650 {
3651 crR=(*bufcr-128)*KcrR;
3652 crG=(*(bufcr++)-128)*KcrG;
3653 cbG=(*bufcb-128)*KcbG;
3654 cbB=(*(bufcb++)-128)*KcbB;
3655
3656 y=(bufy[j]-16)*Ky;
3657
3658 tmp=(y+cbB)>>16;
3659 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3660 tmp=(y-crG-cbG)>>16;
3661 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3662 tmp=(y+crR)>>16;
3663 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3664
3665 y=(bufy[j+1]-16)*Ky;
3666
3667 tmp=(y+cbB)>>16;
3668 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3669 tmp=(y-crG-cbG)>>16;
3670 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3671 tmp=(y+crR)>>16;
3672 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3673
3674 y=(bufy[j+yskip]-16)*Ky;
3675
3676 tmp=(y+cbB)>>16;
3677 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3678 tmp=(y-crG-cbG)>>16;
3679 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3680 tmp=(y+crR)>>16;
3681 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3682
3683 y=(bufy[j+1+yskip]-16)*Ky;
3684
3685 tmp=(y+cbB)>>16;
3686 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3687 tmp=(y-crG-cbG)>>16;
3688 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3689 tmp=(y+crR)>>16;
3690 *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
3691
3692 }
3693 bufoute+=oskip;
3694 bufouto+=oskip;
3695 bufy+=yskip<<1;
3696 }
3697 }
3698
3699 void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride)
3700 {
3701 int tmp;
3702 int i, j;
3703 __s32 y, crR, crG, cbG, cbB;
3704 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
3705 int oskip, yskip;
3706 unsigned char r, g, b;
3707
3708 if(stride==0)
3709 oskip=RTjpeg_width*2;
3710 else
3711 oskip=2*stride-RTjpeg_width*2;
3712
3713 yskip=RTjpeg_width;
3714
3715 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3716 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
3717 bufy=&buf[0];
3718 bufoute=rgb;
3719 bufouto=rgb+RTjpeg_width*2;
3720
3721 for(i=0; i<(RTjpeg_height>>1); i++)
3722 {
3723 for(j=0; j<RTjpeg_width; j+=2)
3724 {
3725 crR=(*bufcr-128)*KcrR;
3726 crG=(*(bufcr++)-128)*KcrG;
3727 cbG=(*bufcb-128)*KcbG;
3728 cbB=(*(bufcb++)-128)*KcbB;
3729
3730 y=(bufy[j]-16)*Ky;
3731
3732 tmp=(y+cbB)>>16;
3733 b=(tmp>255)?255:((tmp<0)?0:tmp);
3734 tmp=(y-crG-cbG)>>16;
3735 g=(tmp>255)?255:((tmp<0)?0:tmp);
3736 tmp=(y+crR)>>16;
3737 r=(tmp>255)?255:((tmp<0)?0:tmp);
3738 tmp=(int)((int)b >> 3);
3739 tmp|=(int)(((int)g >> 2) << 5);
3740 tmp|=(int)(((int)r >> 3) << 11);
3741 *(bufoute++)=tmp&0xff;
3742 *(bufoute++)=tmp>>8;
3743
3744
3745 y=(bufy[j+1]-16)*Ky;
3746
3747 tmp=(y+cbB)>>16;
3748 b=(tmp>255)?255:((tmp<0)?0:tmp);
3749 tmp=(y-crG-cbG)>>16;
3750 g=(tmp>255)?255:((tmp<0)?0:tmp);
3751 tmp=(y+crR)>>16;
3752 r=(tmp>255)?255:((tmp<0)?0:tmp);
3753 tmp=(int)((int)b >> 3);
3754 tmp|=(int)(((int)g >> 2) << 5);
3755 tmp|=(int)(((int)r >> 3) << 11);
3756 *(bufoute++)=tmp&0xff;
3757 *(bufoute++)=tmp>>8;
3758
3759 y=(bufy[j+yskip]-16)*Ky;
3760
3761 tmp=(y+cbB)>>16;
3762 b=(tmp>255)?255:((tmp<0)?0:tmp);
3763 tmp=(y-crG-cbG)>>16;
3764 g=(tmp>255)?255:((tmp<0)?0:tmp);
3765 tmp=(y+crR)>>16;
3766 r=(tmp>255)?255:((tmp<0)?0:tmp);
3767 tmp=(int)((int)b >> 3);
3768 tmp|=(int)(((int)g >> 2) << 5);
3769 tmp|=(int)(((int)r >> 3) << 11);
3770 *(bufouto++)=tmp&0xff;
3771 *(bufouto++)=tmp>>8;
3772
3773 y=(bufy[j+1+yskip]-16)*Ky;
3774
3775 tmp=(y+cbB)>>16;
3776 b=(tmp>255)?255:((tmp<0)?0:tmp);
3777 tmp=(y-crG-cbG)>>16;
3778 g=(tmp>255)?255:((tmp<0)?0:tmp);
3779 tmp=(y+crR)>>16;
3780 r=(tmp>255)?255:((tmp<0)?0:tmp);
3781 tmp=(int)((int)b >> 3);
3782 tmp|=(int)(((int)g >> 2) << 5);
3783 tmp|=(int)(((int)r >> 3) << 11);
3784 *(bufouto++)=tmp&0xff;
3785 *(bufouto++)=tmp>>8;
3786
3787 }
3788 bufoute+=oskip;
3789 bufouto+=oskip;
3790 bufy+=yskip<<1;
3791 }
3792 }
3793
/* TODO: stride handling in the yuvrgb* converters is incomplete --
   RTjpeg_yuvrgb8 below ignores the stride argument entirely. */

3796 void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride)
3797 {
3798 bcopy(buf, rgb, RTjpeg_width*RTjpeg_height);
3799 }
3800