comparison RTjpegN.c @ 3802:bf37d5cd3e6e

used by NuppelVideo decoder
author alex
date Thu, 27 Dec 2001 21:27:29 +0000
parents
children 07d5beff8793
comparison
equal deleted inserted replaced
3801:3cea69dda1b3 3802:bf37d5cd3e6e
1 /*
2 RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
3
4 With modifications by:
5 (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6 and
7 (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22
23 */
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include "RTjpegN.h"
29
30 #ifdef MMX
31 #include "mmx.h"
32 #endif
33
34 //#define SHOWBLOCK 1
35 #define BETTERCOMPRESSION 1
36
/* Zig-zag scan order: entry k gives the row-major index within an 8x8
   coefficient block of the k-th coefficient in scan order. All coders
   below access data[] through this table. */
37 static const unsigned char RTjpeg_ZZ[64]={
38 0,
39 8, 1,
40 2, 9, 16,
41 24, 17, 10, 3,
42 4, 11, 18, 25, 32,
43 40, 33, 26, 19, 12, 5,
44 6, 13, 20, 27, 34, 41, 48,
45 56, 49, 42, 35, 28, 21, 14, 7,
46 15, 22, 29, 36, 43, 50, 57,
47 58, 51, 44, 37, 30, 23,
48 31, 38, 45, 52, 59,
49 60, 53, 46, 39,
50 47, 54, 61,
51 62, 55,
52 63 };
53
/* Per-coefficient fixed-point scale factors; RTjpeg_dct_init() divides the
   quantization tables by these ((qt << 32) / aan), i.e. they fold the
   DCT output scaling into the quantizers. Presumably the AAN (Arai/Agui/
   Nakajima) scale factors, as the name suggests — TODO confirm. */
54 static const __u64 RTjpeg_aan_tab[64]={
55 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
56 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
57 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
58 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
59 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
60 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
61 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
62 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
63 };
64
65 #ifndef MMX
/* scratch workspace for the scalar DCT (see RTjpeg_dctY) */
66 static __s32 RTjpeg_ws[64+31];
67 #endif
/* backing storage — presumably carved up into the pointers below by an
   init function outside this view; TODO confirm against RTjpeg_init */
68 __u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32];
69
70 __s16 *block; // rh
71 __s16 *RTjpeg_block;
/* forward quantization tables (luma/chroma), rescaled in RTjpeg_dct_init */
72 __s32 *RTjpeg_lqt;
73 __s32 *RTjpeg_cqt;
/* NOTE(review): 'i' presumably means inverse (dequantization) tables,
   as used by RTjpeg_s2b's qtbl argument — confirm against callers */
74 __u32 *RTjpeg_liqt;
75 __u32 *RTjpeg_ciqt;
76
77 unsigned char RTjpeg_lb8;
78 unsigned char RTjpeg_cb8;
79 int RTjpeg_width, RTjpeg_height;
80 int RTjpeg_Ywidth, RTjpeg_Cwidth;
81 int RTjpeg_Ysize, RTjpeg_Csize;
82
/* previous-frame buffer; NULL until allocated elsewhere */
83 __s16 *RTjpeg_old=NULL;
84
/* motion-test masks: packed 4x16-bit with MMX, scalar otherwise */
85 #ifdef MMX
86 mmx_t RTjpeg_lmask;
87 mmx_t RTjpeg_cmask;
88 #else
89 __u16 RTjpeg_lmask;
90 __u16 RTjpeg_cmask;
91 #endif
92 int RTjpeg_mtest=0;
93
/* Base quantization tables (luminance / chrominance). These appear to be
   the example tables from the JPEG standard (ITU-T T.81, Annex K). */
94 static const unsigned char RTjpeg_lum_quant_tbl[64] = {
95 16, 11, 10, 16, 24, 40, 51, 61,
96 12, 12, 14, 19, 26, 58, 60, 55,
97 14, 13, 16, 24, 40, 57, 69, 56,
98 14, 17, 22, 29, 51, 87, 80, 62,
99 18, 22, 37, 56, 68, 109, 103, 77,
100 24, 35, 55, 64, 81, 104, 113, 92,
101 49, 64, 78, 87, 103, 121, 120, 101,
102 72, 92, 95, 98, 112, 100, 103, 99
103 };
104
105 static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
106 17, 18, 24, 47, 99, 99, 99, 99,
107 18, 21, 26, 66, 99, 99, 99, 99,
108 24, 26, 56, 99, 99, 99, 99, 99,
109 47, 66, 99, 99, 99, 99, 99, 99,
110 99, 99, 99, 99, 99, 99, 99, 99,
111 99, 99, 99, 99, 99, 99, 99, 99,
112 99, 99, 99, 99, 99, 99, 99, 99,
113 99, 99, 99, 99, 99, 99, 99, 99
114 };
115
116 #ifdef BETTERCOMPRESSION
117
118 /*--------------------------------------------------*/
119 /* better encoding, but needs a lot more cpu time */
120 /* seems to be more effective than old method +lzo */
121 /* with this encoding lzo isn't efficient anymore */
122 /* there is still more potential for better */
123 /* encoding but that would need even more cputime */
124 /* anyway your mileage may vary */
125 /* */
126 /* written by Martin BIELY and Roman HOCHLEITNER */
127 /*--------------------------------------------------*/
128
129 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
130 /* Block to Stream (encoding) */
131 /* */
132
133 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
134 {
135 register int ci, co=1;
136 register __s16 ZZvalue;
137 register unsigned char bitten;
138 register unsigned char bitoff;
139
140 #ifdef SHOWBLOCK
141
142 int ii;
143 for (ii=0; ii < 64; ii++) {
144 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
145 }
146 fprintf(stdout, "\n\n");
147
148 #endif
149
150 // *strm++ = 0x10;
151 // *strm = 0x00;
152 //
153 // return 2;
154
155 // first byte allways written
156 (__u8)strm[0]=
157 (__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
158
159
160 ci=63;
161 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
162
163 bitten = ((unsigned char)ci) << 2;
164
165 if (ci==0) {
166 (__u8)strm[1]= bitten;
167 co = 2;
168 return (int)co;
169 }
170
171 /* bitoff=0 because the high 6bit contain first non zero position */
172 bitoff = 0;
173 co = 1;
174
175 for(; ci>0; ci--) {
176
177 ZZvalue = data[RTjpeg_ZZ[ci]];
178
179 switch(ZZvalue) {
180 case 0:
181 break;
182 case 1:
183 bitten |= (0x01<<bitoff);
184 break;
185 case -1:
186 bitten |= (0x03<<bitoff);
187 break;
188 default:
189 bitten |= (0x02<<bitoff);
190 goto HERZWEH;
191 break;
192 }
193
194 if( bitoff == 0 ) {
195 (__u8)strm[co]= bitten;
196 bitten = 0;
197 bitoff = 8;
198 co++;
199 } /* "fall through" */
200 bitoff-=2;
201
202 }
203
204 /* ci must be 0 */
205 if(bitoff != 6) {
206
207 (__u8)strm[co]= bitten;
208 co++;
209
210 }
211 goto BAUCHWEH;
212
213 HERZWEH:
214 /* ci cannot be 0 */
215 /* correct bitoff to nibble boundaries */
216
217 switch(bitoff){
218 case 4:
219 case 6:
220 bitoff = 0;
221 break;
222 case 2:
223 case 0:
224 (__u8)strm[co]= bitten;
225 bitoff = 4;
226 co++;
227 bitten = 0; // clear half nibble values in bitten
228 break;
229 default:
230 break;
231 }
232
233 for(; ci>0; ci--) {
234
235 ZZvalue = data[RTjpeg_ZZ[ci]];
236
237 if( (ZZvalue > 7) || (ZZvalue < -7) ) {
238 bitten |= (0x08<<bitoff);
239 goto HIRNWEH;
240 }
241
242 bitten |= (ZZvalue&0xf)<<bitoff;
243
244 if( bitoff == 0 ) {
245 (__u8)strm[co]= bitten;
246 bitten = 0;
247 bitoff = 8;
248 co++;
249 } /* "fall thru" */
250 bitoff-=4;
251 }
252
253 /* ci must be 0 */
254 if( bitoff == 0 ) {
255 (__u8)strm[co]= bitten;
256 co++;
257 }
258 goto BAUCHWEH;
259
260 HIRNWEH:
261
262 (__u8)strm[co]= bitten;
263 co++;
264
265
266 /* bitting is over now we bite */
267 for(; ci>0; ci--) {
268
269 ZZvalue = data[RTjpeg_ZZ[ci]];
270
271 if(ZZvalue>0)
272 {
273 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
274 }
275 else
276 {
277 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
278 }
279
280 }
281
282
283 BAUCHWEH:
284 /* we gotoo much now we are ill */
285 #ifdef SHOWBLOCK
286 {
287 int i;
288 fprintf(stdout, "\nco = '%d'\n", co);
289 for (i=0; i < co+2; i++) {
290 fprintf(stdout, "%d ", strm[i]);
291 }
292 fprintf(stdout, "\n\n");
293 }
294 #endif
295
296 return (int)co;
297 }
298
299 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
300 /* Stream to Block (decoding) */
301 /* */
302
303 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
304 {
305 int ci;
306 register int co;
307 register int i;
308 register unsigned char bitten;
309 register unsigned char bitoff;
310
311 /* first byte always read */
312 i=RTjpeg_ZZ[0];
313 data[i]=((__u8)strm[0])*qtbl[i];
314
315 /* we start at the behind */
316
317 bitten = ((unsigned char)strm[1]) >> 2;
318 co = 63;
319 for(; co > bitten; co--) {
320
321 data[RTjpeg_ZZ[co]] = 0;
322
323 }
324
325 if (co==0) {
326 ci = 2;
327 goto AUTOBAHN;
328 }
329
330 /* we have to read the last 2 bits of the second byte */
331 ci=1;
332 bitoff = 0;
333
334 for(; co>0; co--) {
335
336 bitten = ((unsigned char)strm[ci]) >> bitoff;
337 bitten &= 0x03;
338
339 i=RTjpeg_ZZ[co];
340
341 switch( bitten ) {
342 case 0x03:
343 data[i]= -qtbl[i];
344 break;
345 case 0x02:
346 goto FUSSWEG;
347 break;
348 case 0x01:
349 data[i]= qtbl[i];
350 break;
351 case 0x00:
352 data[i]= 0;
353 break;
354 default:
355
356 }
357
358 if( bitoff == 0 ) {
359 bitoff = 8;
360 ci++;
361 }
362 bitoff -= 2;
363 }
364 /* co is 0 now */
365 /* data is written properly */
366
367 /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
368 if (bitoff!=6) ci++;
369
370 goto AUTOBAHN;
371
372
373 FUSSWEG:
374 /* correct bitoff to nibble */
375 switch(bitoff){
376 case 4:
377 case 6:
378 bitoff = 0;
379 break;
380 case 2:
381 case 0:
382 /* we have to read from the next byte */
383 ci++;
384 bitoff = 4;
385 break;
386 default:
387 break;
388 }
389
390 for(; co>0; co--) {
391
392 bitten = ((unsigned char)strm[ci]) >> bitoff;
393 bitten &= 0x0f;
394
395 i=RTjpeg_ZZ[co];
396
397 if( bitten == 0x08 ) {
398 goto STRASSE;
399 }
400
401 /* the compiler cannot do sign extension for signed nibbles */
402 if( bitten & 0x08 ) {
403 bitten |= 0xf0;
404 }
405 /* the unsigned char bitten now is a valid signed char */
406
407 data[i]=((signed char)bitten)*qtbl[i];
408
409 if( bitoff == 0 ) {
410 bitoff = 8;
411 ci++;
412 }
413 bitoff -= 4;
414 }
415 /* co is 0 */
416
417 /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
418 if (bitoff!=4) ci++;
419
420 goto AUTOBAHN;
421
422 STRASSE:
423 ci++;
424
425 for(; co>0; co--) {
426 i=RTjpeg_ZZ[co];
427 data[i]=strm[ci++]*qtbl[i];
428 }
429
430 /* ci now is the count, because it points to next element => no incrementing */
431
432 AUTOBAHN:
433
434 #ifdef SHOWBLOCK
435 fprintf(stdout, "\nci = '%d'\n", ci);
436 for (i=0; i < 64; i++) {
437 fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
438 }
439 fprintf(stdout, "\n\n");
440 #endif
441
442 return ci;
443 }
444
445 #else
446
447 int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
448 {
449 register int ci, co=1, tmp;
450 register __s16 ZZvalue;
451
452 #ifdef SHOWBLOCK
453
454 int ii;
455 for (ii=0; ii < 64; ii++) {
456 fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
457 }
458 fprintf(stdout, "\n\n");
459
460 #endif
461
462 (__u8)strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
463
464 for(ci=1; ci<=bt8; ci++)
465 {
466 ZZvalue = data[RTjpeg_ZZ[ci]];
467
468 if(ZZvalue>0)
469 {
470 strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue;
471 }
472 else
473 {
474 strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue;
475 }
476 }
477
478 for(; ci<64; ci++)
479 {
480 ZZvalue = data[RTjpeg_ZZ[ci]];
481
482 if(ZZvalue>0)
483 {
484 strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue;
485 }
486 else if(ZZvalue<0)
487 {
488 strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue;
489 }
490 else /* compress zeros */
491 {
492 tmp=ci;
493 do
494 {
495 ci++;
496 }
497 while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
498
499 strm[co++]=(__s8)(63+(ci-tmp));
500 ci--;
501 }
502 }
503 return (int)co;
504 }
505
506 int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
507 {
508 int ci=1, co=1, tmp;
509 register int i;
510
511 i=RTjpeg_ZZ[0];
512 data[i]=((__u8)strm[0])*qtbl[i];
513
514 for(co=1; co<=bt8; co++)
515 {
516 i=RTjpeg_ZZ[co];
517 data[i]=strm[ci++]*qtbl[i];
518 }
519
520 for(; co<64; co++)
521 {
522 if(strm[ci]>63)
523 {
524 tmp=co+strm[ci]-63;
525 for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
526 co--;
527 } else
528 {
529 i=RTjpeg_ZZ[co];
530 data[i]=strm[ci]*qtbl[i];
531 }
532 ci++;
533 }
534 return (int)ci;
535 }
536 #endif
537
538 #if defined(MMX)
539 void RTjpeg_quant_init(void)
540 {
541 int i;
542 __s16 *qtbl;
543
544 qtbl=(__s16 *)RTjpeg_lqt;
545 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];
546
547 qtbl=(__s16 *)RTjpeg_cqt;
548 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
549 }
550
/* four packed 16-bit 1s — multiplier padding for pmaddwd in RTjpeg_quant */
551 static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL;
/* four packed 16-bit 32767s — the rounding term added before the >>16 */
552 static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL;
553
554 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
555 {
556 int i;
557 mmx_t *bl, *ql;
558
559 ql=(mmx_t *)qtbl;
560 bl=(mmx_t *)block;
561
562 movq_m2r(RTjpeg_ones, mm6);
563 movq_m2r(RTjpeg_half, mm7);
564
565 for(i=16; i; i--)
566 {
567 movq_m2r(*(ql++), mm0); /* quant vals (4) */
568 movq_m2r(*bl, mm2); /* block vals (4) */
569 movq_r2r(mm0, mm1);
570 movq_r2r(mm2, mm3);
571
572 punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
573 punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
574
575 punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
576 punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
577
578 pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
579 pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
580
581 psrad_i2r(16, mm0);
582 psrad_i2r(16, mm1);
583
584 packssdw_r2r(mm1, mm0);
585
586 movq_r2m(mm0, *(bl++));
587
588 }
589 }
590 #else
591 void RTjpeg_quant_init(void)
592 {
 /* no setup needed: the scalar quantizer uses the 32-bit tables directly */
593 }
594
595 void RTjpeg_quant(__s16 *block, __s32 *qtbl)
596 {
597 int i;
598
599 for(i=0; i<64; i++)
600 block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16);
601 }
602 #endif
603
604 /*
605 * Perform the forward DCT on one block of samples.
606 */
607 #ifdef MMX
/* Packed 16-bit DCT constants, value = round(c * 2^14) — the same
   cosine factors as the scalar FIX_* macros below, at higher precision:
   0x2D41=11585 (c4), 0x187E=6270 (c6), 0x22A3=8867 (c2-c6),
   0x539F=21407 (c2+c6). */
608 static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL;
609 static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL;
610 static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
611 static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
612 static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;
613
614 #else
615
/* Fixed-point cosine factors, scaled by 2^8 (FIX(x) = round(x*256)) */
616 #define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */
617 #define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */
618 #define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */
619 #define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */
620
/* round and drop 8 (resp. 16) fractional bits */
621 #define DESCALE10(x) (__s16)( ((x)+128) >> 8)
622 #define DESCALE20(x) (__s16)(((x)+32768) >> 16)
623 #define D_MULTIPLY(var,const) ((__s32) ((var) * (const)))
624 #endif
625
626 void RTjpeg_dct_init(void)
627 {
628 int i;
629
630 for(i=0; i<64; i++)
631 {
632 RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
633 RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
634 }
635 }
636
637 void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
638 {
639 #ifndef MMX
640 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
641 __s32 tmp10, tmp11, tmp12, tmp13;
642 __s32 z1, z2, z3, z4, z5, z11, z13;
643 __u8 *idataptr;
644 __s16 *odataptr;
645 __s32 *wsptr;
646 int ctr;
647
648 idataptr = idata;
649 wsptr = RTjpeg_ws;
650 for (ctr = 7; ctr >= 0; ctr--) {
651 tmp0 = idataptr[0] + idataptr[7];
652 tmp7 = idataptr[0] - idataptr[7];
653 tmp1 = idataptr[1] + idataptr[6];
654 tmp6 = idataptr[1] - idataptr[6];
655 tmp2 = idataptr[2] + idataptr[5];
656 tmp5 = idataptr[2] - idataptr[5];
657 tmp3 = idataptr[3] + idataptr[4];
658 tmp4 = idataptr[3] - idataptr[4];
659
660 tmp10 = (tmp0 + tmp3); /* phase 2 */
661 tmp13 = tmp0 - tmp3;
662 tmp11 = (tmp1 + tmp2);
663 tmp12 = tmp1 - tmp2;
664
665 wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
666 wsptr[4] = (tmp10 - tmp11)<<8;
667
668 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
669 wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
670 wsptr[6] = (tmp13<<8) - z1;
671
672 tmp10 = tmp4 + tmp5; /* phase 2 */
673 tmp11 = tmp5 + tmp6;
674 tmp12 = tmp6 + tmp7;
675
676 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
677 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
678 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
679 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
680
681 z11 = (tmp7<<8) + z3; /* phase 5 */
682 z13 = (tmp7<<8) - z3;
683
684 wsptr[5] = z13 + z2; /* phase 6 */
685 wsptr[3] = z13 - z2;
686 wsptr[1] = z11 + z4;
687 wsptr[7] = z11 - z4;
688
689 idataptr += rskip<<3; /* advance pointer to next row */
690 wsptr += 8;
691 }
692
693 wsptr = RTjpeg_ws;
694 odataptr=odata;
695 for (ctr = 7; ctr >= 0; ctr--) {
696 tmp0 = wsptr[0] + wsptr[56];
697 tmp7 = wsptr[0] - wsptr[56];
698 tmp1 = wsptr[8] + wsptr[48];
699 tmp6 = wsptr[8] - wsptr[48];
700 tmp2 = wsptr[16] + wsptr[40];
701 tmp5 = wsptr[16] - wsptr[40];
702 tmp3 = wsptr[24] + wsptr[32];
703 tmp4 = wsptr[24] - wsptr[32];
704
705 tmp10 = tmp0 + tmp3; /* phase 2 */
706 tmp13 = tmp0 - tmp3;
707 tmp11 = tmp1 + tmp2;
708 tmp12 = tmp1 - tmp2;
709
710 odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
711 odataptr[32] = DESCALE10(tmp10 - tmp11);
712
713 z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
714 odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
715 odataptr[48] = DESCALE20((tmp13<<8) - z1);
716
717 tmp10 = tmp4 + tmp5; /* phase 2 */
718 tmp11 = tmp5 + tmp6;
719 tmp12 = tmp6 + tmp7;
720
721 z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
722 z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
723 z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
724 z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
725
726 z11 = (tmp7<<8) + z3; /* phase 5 */
727 z13 = (tmp7<<8) - z3;
728
729 odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
730 odataptr[24] = DESCALE20(z13 - z2);
731 odataptr[8] = DESCALE20(z11 + z4);
732 odataptr[56] = DESCALE20(z11 - z4);
733
734 odataptr++; /* advance pointer to next column */
735 wsptr++;
736 }
737 #else
738 volatile mmx_t tmp6, tmp7;
739 register mmx_t *dataptr = (mmx_t *)odata;
740 mmx_t *idata2 = (mmx_t *)idata;
741
742 // first copy the input 8 bit to the destination 16 bits
743
744 movq_m2r(RTjpeg_zero, mm2);
745
746
747 movq_m2r(*idata2, mm0);
748 movq_r2r(mm0, mm1);
749
750 punpcklbw_r2r(mm2, mm0);
751 movq_r2m(mm0, *(dataptr));
752
753 punpckhbw_r2r(mm2, mm1);
754 movq_r2m(mm1, *(dataptr+1));
755
756 idata2 += rskip;
757
758 movq_m2r(*idata2, mm0);
759 movq_r2r(mm0, mm1);
760
761 punpcklbw_r2r(mm2, mm0);
762 movq_r2m(mm0, *(dataptr+2));
763
764 punpckhbw_r2r(mm2, mm1);
765 movq_r2m(mm1, *(dataptr+3));
766
767 idata2 += rskip;
768
769 movq_m2r(*idata2, mm0);
770 movq_r2r(mm0, mm1);
771
772 punpcklbw_r2r(mm2, mm0);
773 movq_r2m(mm0, *(dataptr+4));
774
775 punpckhbw_r2r(mm2, mm1);
776 movq_r2m(mm1, *(dataptr+5));
777
778 idata2 += rskip;
779
780 movq_m2r(*idata2, mm0);
781 movq_r2r(mm0, mm1);
782
783 punpcklbw_r2r(mm2, mm0);
784 movq_r2m(mm0, *(dataptr+6));
785
786 punpckhbw_r2r(mm2, mm1);
787 movq_r2m(mm1, *(dataptr+7));
788
789 idata2 += rskip;
790
791 movq_m2r(*idata2, mm0);
792 movq_r2r(mm0, mm1);
793
794 punpcklbw_r2r(mm2, mm0);
795 movq_r2m(mm0, *(dataptr+8));
796
797 punpckhbw_r2r(mm2, mm1);
798 movq_r2m(mm1, *(dataptr+9));
799
800 idata2 += rskip;
801
802 movq_m2r(*idata2, mm0);
803 movq_r2r(mm0, mm1);
804
805 punpcklbw_r2r(mm2, mm0);
806 movq_r2m(mm0, *(dataptr+10));
807
808 punpckhbw_r2r(mm2, mm1);
809 movq_r2m(mm1, *(dataptr+11));
810
811 idata2 += rskip;
812
813 movq_m2r(*idata2, mm0);
814 movq_r2r(mm0, mm1);
815
816 punpcklbw_r2r(mm2, mm0);
817 movq_r2m(mm0, *(dataptr+12));
818
819 punpckhbw_r2r(mm2, mm1);
820 movq_r2m(mm1, *(dataptr+13));
821
822 idata2 += rskip;
823
824 movq_m2r(*idata2, mm0);
825 movq_r2r(mm0, mm1);
826
827 punpcklbw_r2r(mm2, mm0);
828 movq_r2m(mm0, *(dataptr+14));
829
830 punpckhbw_r2r(mm2, mm1);
831 movq_r2m(mm1, *(dataptr+15));
832
833 /* Start Transpose to do calculations on rows */
834
835 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
836
837 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
838 movq_r2r(mm7, mm5);
839
840 punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
841 movq_r2r(mm6, mm2);
842
843 punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
844 movq_r2r(mm7, mm1);
845
846 movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
847 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
848
849 movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
850 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
851
852 movq_r2m(mm7,*(dataptr+9)); // write result 1
853 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
854
855 movq_r2m(mm1,*(dataptr+11)); // write result 2
856 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
857
858 movq_r2r(mm5, mm1);
859 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
860
861 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
862 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
863
864 movq_r2m(mm5,*(dataptr+13)); // write result 3
865
866 // last 4x4 done
867
868 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
869
870 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
871 movq_r2r(mm0, mm6);
872
873 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
874 movq_r2r(mm2, mm7);
875
876 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
877 movq_r2r(mm0, mm4);
878
879 //
880 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
881 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
882
883 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
884 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
885
886 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
887 movq_r2r(mm1, mm2); // copy first line
888
889 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
890 movq_r2r(mm6, mm5); // copy first intermediate result
891
892 movq_r2m(mm0, *(dataptr+8)); // write result 1
893 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
894
895 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
896 movq_r2r(mm3, mm0); // copy third line
897
898 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
899
900 movq_r2m(mm4, *(dataptr+10)); // write result 2 out
901 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
902
903 punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
904 movq_r2r(mm1, mm4);
905
906 movq_r2m(mm6, *(dataptr+12)); // write result 3 out
907 punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
908
909 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
910 movq_r2r(mm2, mm6);
911
912 movq_r2m(mm5, *(dataptr+14)); // write result 4 out
913 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
914
915 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
916 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
917
918 movq_r2m(mm4, *(dataptr+3)); // write result 6 out
919 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
920
921 movq_r2m(mm2, *(dataptr+5)); // write result 7 out
922
923 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
924
925 movq_r2m(mm6, *(dataptr+7)); // write result 8 out
926
927
928 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
929
930 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
931 movq_r2r(mm0, mm2);
932
933 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
934 movq_r2r(mm7, mm4);
935
936 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
937 movq_r2r(mm0, mm1);
938
939 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
940 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
941
942 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
943 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
944
945 movq_r2r(mm0, mm7); // write result 1
946 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
947
948 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
949 movq_r2r(mm1, mm6); // write result 2
950
951 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
952 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
953
954 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
955 movq_r2r(mm2, mm3); // copy first intermediate result
956
957 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
958 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
959
960 movq_r2m(mm7, tmp7);
961 movq_r2r(mm2, mm5); // write result 3
962
963 movq_r2m(mm6, tmp6);
964 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
965
966 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
967 movq_r2r(mm3, mm4); // write result 4
968
969 /************************************************************************************************
970 End of Transpose
971 ************************************************************************************************/
972
973
974 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
975 movq_r2r(mm0, mm7);
976
977 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
978 movq_r2r(mm1, mm6);
979
980 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
981 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
982
983 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
984 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
985
986 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
987 paddw_r2r(mm7, mm6); // tmp12 + tmp13
988
989 /* stage 3 */
990
991 movq_m2r(tmp6, mm2);
992 movq_r2r(mm0, mm3);
993
994 psllw_i2r(2, mm6); // m8 * 2^2
995 paddw_r2r(mm1, mm0);
996
997 pmulhw_m2r(RTjpeg_C4, mm6); // z1
998 psubw_r2r(mm1, mm3);
999
1000 movq_r2m(mm0, *dataptr);
1001 movq_r2r(mm7, mm0);
1002
1003 /* Odd part */
1004 movq_r2m(mm3, *(dataptr+8));
1005 paddw_r2r(mm5, mm4); // tmp10
1006
1007 movq_m2r(tmp7, mm3);
1008 paddw_r2r(mm6, mm0); // tmp32
1009
1010 paddw_r2r(mm2, mm5); // tmp11
1011 psubw_r2r(mm6, mm7); // tmp33
1012
1013 movq_r2m(mm0, *(dataptr+4));
1014 paddw_r2r(mm3, mm2); // tmp12
1015
1016 /* stage 4 */
1017
1018 movq_r2m(mm7, *(dataptr+12));
1019 movq_r2r(mm4, mm1); // copy of tmp10
1020
1021 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1022 psllw_i2r(2, mm4); // m8 * 2^2
1023
1024 movq_m2r(RTjpeg_C2mC6, mm0);
1025 psllw_i2r(2, mm1);
1026
1027 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1028 psllw_i2r(2, mm2);
1029
1030 pmulhw_r2r(mm0, mm4); // z5
1031
1032 /* stage 5 */
1033
1034 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1035 psllw_i2r(2, mm5);
1036
1037 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1038 movq_r2r(mm3, mm0); // copy tmp7
1039
1040 movq_m2r(*(dataptr+1), mm7);
1041 paddw_r2r(mm1, mm4); // z2
1042
1043 paddw_r2r(mm1, mm2); // z4
1044
1045 paddw_r2r(mm5, mm0); // z11
1046 psubw_r2r(mm5, mm3); // z13
1047
1048 /* stage 6 */
1049
1050 movq_r2r(mm3, mm5); // copy z13
1051 psubw_r2r(mm4, mm3); // y3=z13 - z2
1052
1053 paddw_r2r(mm4, mm5); // y5=z13 + z2
1054 movq_r2r(mm0, mm6); // copy z11
1055
1056 movq_r2m(mm3, *(dataptr+6)); //save y3
1057 psubw_r2r(mm2, mm0); // y7=z11 - z4
1058
1059 movq_r2m(mm5, *(dataptr+10)); //save y5
1060 paddw_r2r(mm2, mm6); // y1=z11 + z4
1061
1062 movq_r2m(mm0, *(dataptr+14)); //save y7
1063
1064 /************************************************
1065 * End of 1st 4 rows
1066 ************************************************/
1067
1068 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1069 movq_r2r(mm7, mm0); // copy x0
1070
1071 movq_r2m(mm6, *(dataptr+2)); //save y1
1072
1073 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1074 movq_r2r(mm1, mm6); // copy x1
1075
1076 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1077
1078 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1079 movq_r2r(mm2, mm5); // copy x2
1080
1081 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1082 movq_r2r(mm3, mm4); // copy x3
1083
1084 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1085
1086 movq_r2m(mm7, tmp7); // save tmp07
1087 movq_r2r(mm0, mm7); // copy tmp00
1088
1089 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1090
1091 /* stage 2, Even Part */
1092
1093 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1094
1095 movq_r2m(mm6, tmp6); // save tmp07
1096 movq_r2r(mm1, mm6); // copy tmp01
1097
1098 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1099 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1100
1101 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1102
1103 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1104 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1105
1106 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1107
1108 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1109 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1110
1111 /* stage 3, Even and stage 4 & 5 even */
1112
1113 movq_m2r(tmp6, mm2); // load tmp6
1114 movq_r2r(mm0, mm3); // copy tmp10
1115
1116 psllw_i2r(2, mm6); // shift z1
1117 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1118
1119 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1120 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1121
1122 movq_r2m(mm0, *(dataptr+1)); //save y0
1123 movq_r2r(mm7, mm0); // copy tmp13
1124
1125 /* odd part */
1126
1127 movq_r2m(mm3, *(dataptr+9)); //save y4
1128 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1129
1130 movq_m2r(tmp7, mm3); // load tmp7
1131 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1132
1133 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1134 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1135
1136 movq_r2m(mm0, *(dataptr+5)); //save y2
1137 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1138
1139 /* stage 4 */
1140
1141 movq_r2m(mm7, *(dataptr+13)); //save y6
1142 movq_r2r(mm4, mm1); // copy tmp10
1143
1144 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1145 psllw_i2r(2, mm4); // shift tmp10
1146
1147 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1148 psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1149
1150 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1151 psllw_i2r(2, mm5); // prepare for multiply
1152
1153 pmulhw_r2r(mm0, mm4); // multiply by converted real
1154
1155 /* stage 5 */
1156
1157 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1158 psllw_i2r(2, mm2); // prepare for multiply
1159
1160 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1161 movq_r2r(mm3, mm0); // copy tmp7
1162
1163 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1164 paddw_r2r(mm1, mm4); // z2
1165
1166 paddw_r2r(mm5, mm0); // z11
1167 psubw_r2r(mm5, mm3); // z13
1168
1169 /* stage 6 */
1170
1171 movq_r2r(mm3, mm5); // copy z13
1172 paddw_r2r(mm1, mm2); // z4
1173
1174 movq_r2r(mm0, mm6); // copy z11
1175 psubw_r2r(mm4, mm5); // y3
1176
1177 paddw_r2r(mm2, mm6); // y1
1178 paddw_r2r(mm4, mm3); // y5
1179
1180 movq_r2m(mm5, *(dataptr+7)); //save y3
1181
1182 movq_r2m(mm6, *(dataptr+3)); //save y1
1183 psubw_r2r(mm2, mm0); // y7
1184
1185 /************************************************************************************************
1186 Start of Transpose
1187 ************************************************************************************************/
1188
1189 movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1190 movq_r2r(mm7, mm5); // copy first line
1191
1192 punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1193 movq_r2r(mm6, mm2); // copy third line
1194
1195 punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1196 movq_r2r(mm7, mm1); // copy first intermediate result
1197
1198 punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1199
1200 punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1201
1202 movq_r2m(mm7, *(dataptr+9)); // write result 1
1203 punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1204
1205 movq_r2m(mm1, *(dataptr+11)); // write result 2
1206 punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1207
1208 movq_r2r(mm5, mm1); // copy first intermediate result
1209 punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1210
1211 movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1212 punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1213
1214 movq_r2m(mm5, *(dataptr+13)); // write result 3
1215
1216 /****** last 4x4 done */
1217
1218 movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1219
1220 movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1221 movq_r2r(mm0, mm6); // copy first line
1222
1223 punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1224 movq_r2r(mm2, mm7); // copy third line
1225
1226 punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1227 movq_r2r(mm0, mm4); // copy first intermediate result
1228
1229
1230
1231 movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1232 punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1233
1234 movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1235 punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1236
1237 punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1238 movq_r2r(mm1, mm2); // copy first line
1239
1240 punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1241 movq_r2r(mm6, mm5); // copy first intermediate result
1242
1243 movq_r2m(mm0, *(dataptr+8)); // write result 1
1244 punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1245
1246 punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1247 movq_r2r(mm3, mm0); // copy third line
1248
1249 punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1250
1251 movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1252 punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1253
1254 punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines
1255 movq_r2r(mm1, mm4); // copy second intermediate result
1256
1257 movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1258 punpckldq_r2r(mm3, mm1); //
1259
1260 punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1261 movq_r2r(mm2, mm6); // copy second intermediate result
1262
1263 movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1264 punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1265
1266 movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1267 punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1268
1269 movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1270 punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1271
1272 movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1273
1274 movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1275
1276 movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1277
1278 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1279
1280 movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1281 movq_r2r(mm0, mm2); // copy first line
1282
1283 punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1284 movq_r2r(mm7, mm4); // copy third line
1285
1286 punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1287 movq_r2r(mm0, mm1); // copy first intermediate result
1288
1289 movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1290 punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1291
1292 movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1293 punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1294
1295 movq_r2r(mm0, mm7); // write result 1
1296 punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1297
1298 psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1299 movq_r2r(mm1, mm6); // write result 2
1300
1301 paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1302 punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1303
1304 paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1305 movq_r2r(mm2, mm3); // copy first intermediate result
1306
1307 psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1308 punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1309
1310 movq_r2m(mm7, tmp7); // save tmp07
1311 movq_r2r(mm2, mm5); // write result 3
1312
1313 movq_r2m(mm6, tmp6); // save tmp06
1314
1315 punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1316
1317 paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1318 movq_r2r(mm3, mm4); // write result 4
1319
1320 /************************************************************************************************
1321 End of Transpose 2
1322 ************************************************************************************************/
1323
1324 paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1325 movq_r2r(mm0, mm7);
1326
1327 psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1328 movq_r2r(mm1, mm6);
1329
1330 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1331 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1332
1333 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1334 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1335
1336 psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1337 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1338
1339 /* stage 3 */
1340
1341 movq_m2r(tmp6, mm2);
1342 movq_r2r(mm0, mm3);
1343
1344 psllw_i2r(2, mm6); // m8 * 2^2
1345 paddw_r2r(mm1, mm0);
1346
1347 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1348 psubw_r2r(mm1, mm3);
1349
1350 movq_r2m(mm0, *dataptr);
1351 movq_r2r(mm7, mm0);
1352
1353 /* Odd part */
1354 movq_r2m(mm3, *(dataptr+8));
1355 paddw_r2r(mm5, mm4); // tmp10
1356
1357 movq_m2r(tmp7, mm3);
1358 paddw_r2r(mm6, mm0); // tmp32
1359
1360 paddw_r2r(mm2, mm5); // tmp11
1361 psubw_r2r(mm6, mm7); // tmp33
1362
1363 movq_r2m(mm0, *(dataptr+4));
1364 paddw_r2r(mm3, mm2); // tmp12
1365
1366 /* stage 4 */
1367 movq_r2m(mm7, *(dataptr+12));
1368 movq_r2r(mm4, mm1); // copy of tmp10
1369
1370 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1371 psllw_i2r(2, mm4); // m8 * 2^2
1372
1373 movq_m2r(RTjpeg_C2mC6, mm0);
1374 psllw_i2r(2, mm1);
1375
1376 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1377 psllw_i2r(2, mm2);
1378
1379 pmulhw_r2r(mm0, mm4); // z5
1380
1381 /* stage 5 */
1382
1383 pmulhw_m2r(RTjpeg_C2pC6, mm2);
1384 psllw_i2r(2, mm5);
1385
1386 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1387 movq_r2r(mm3, mm0); // copy tmp7
1388
1389 movq_m2r(*(dataptr+1), mm7);
1390 paddw_r2r(mm1, mm4); // z2
1391
1392 paddw_r2r(mm1, mm2); // z4
1393
1394 paddw_r2r(mm5, mm0); // z11
1395 psubw_r2r(mm5, mm3); // z13
1396
1397 /* stage 6 */
1398
1399 movq_r2r(mm3, mm5); // copy z13
1400 psubw_r2r(mm4, mm3); // y3=z13 - z2
1401
1402 paddw_r2r(mm4, mm5); // y5=z13 + z2
1403 movq_r2r(mm0, mm6); // copy z11
1404
1405 movq_r2m(mm3, *(dataptr+6)); //save y3
1406 psubw_r2r(mm2, mm0); // y7=z11 - z4
1407
1408 movq_r2m(mm5, *(dataptr+10)); //save y5
1409 paddw_r2r(mm2, mm6); // y1=z11 + z4
1410
1411 movq_r2m(mm0, *(dataptr+14)); //save y7
1412
1413 /************************************************
1414 * End of 1st 4 rows
1415 ************************************************/
1416
1417 movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1418 movq_r2r(mm7, mm0); // copy x0
1419
1420 movq_r2m(mm6, *(dataptr+2)); //save y1
1421
1422 movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1423 movq_r2r(mm1, mm6); // copy x1
1424
1425 paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1426
1427 movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1428 movq_r2r(mm2, mm5); // copy x2
1429
1430 psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1431 movq_r2r(mm3, mm4); // copy x3
1432
1433 paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1434
1435 movq_r2m(mm7, tmp7); // save tmp07
1436 movq_r2r(mm0, mm7); // copy tmp00
1437
1438 psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1439
1440 /* stage 2, Even Part */
1441
1442 paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1443
1444 movq_r2m(mm6, tmp6); // save tmp07
1445 movq_r2r(mm1, mm6); // copy tmp01
1446
1447 paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1448 paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1449
1450 psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1451
1452 psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1453 psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1454
1455 paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1456
1457 psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1458 paddw_r2r(mm7, mm6); // tmp12 + tmp13
1459
1460 /* stage 3, Even and stage 4 & 5 even */
1461
1462 movq_m2r(tmp6, mm2); // load tmp6
1463 movq_r2r(mm0, mm3); // copy tmp10
1464
1465 psllw_i2r(2, mm6); // shift z1
1466 paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1467
1468 pmulhw_m2r(RTjpeg_C4, mm6); // z1
1469 psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1470
1471 movq_r2m(mm0, *(dataptr+1)); //save y0
1472 movq_r2r(mm7, mm0); // copy tmp13
1473
1474 /* odd part */
1475
1476 movq_r2m(mm3, *(dataptr+9)); //save y4
1477 paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1478
1479 movq_m2r(tmp7, mm3); // load tmp7
1480 paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1481
1482 paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1483 psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1484
1485 movq_r2m(mm0, *(dataptr+5)); //save y2
1486 paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1487
1488 /* stage 4 */
1489
1490 movq_r2m(mm7, *(dataptr+13)); //save y6
1491 movq_r2r(mm4, mm1); // copy tmp10
1492
1493 psubw_r2r(mm2, mm1); // tmp10 - tmp12
1494 psllw_i2r(2, mm4); // shift tmp10
1495
1496 movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1497 psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1498
1499 pmulhw_m2r(RTjpeg_C6, mm1); // z5
1500 psllw_i2r(2, mm5); // prepare for multiply
1501
1502 pmulhw_r2r(mm0, mm4); // multiply by converted real
1503
1504 /* stage 5 */
1505
1506 pmulhw_m2r(RTjpeg_C4, mm5); // z3
1507 psllw_i2r(2, mm2); // prepare for multiply
1508
1509 pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1510 movq_r2r(mm3, mm0); // copy tmp7
1511
1512 movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1513 paddw_r2r(mm1, mm4); // z2
1514
1515 paddw_r2r(mm5, mm0); // z11
1516 psubw_r2r(mm5, mm3); // z13
1517
1518 /* stage 6 */
1519
1520 movq_r2r(mm3, mm5); // copy z13
1521 paddw_r2r(mm1, mm2); // z4
1522
1523 movq_r2r(mm0, mm6); // copy z11
1524 psubw_r2r(mm4, mm5); // y3
1525
1526 paddw_r2r(mm2, mm6); // y1
1527 paddw_r2r(mm4, mm3); // y5
1528
1529 movq_r2m(mm5, *(dataptr+7)); //save y3
1530 psubw_r2r(mm2, mm0); // y7=z11 - z4
1531
1532 movq_r2m(mm3, *(dataptr+11)); //save y5
1533
1534 movq_r2m(mm6, *(dataptr+3)); //save y1
1535
1536 movq_r2m(mm0, *(dataptr+15)); //save y7
1537
1538
1539 #endif
1540 }
1541
/* 8-bit fixed-point multiplier constants: FIX(v) == round(v * 256). */
#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */
#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */
#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */
#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */

/* Descale by 3 bits with rounding (+4 is half of 2^3). */
#define DESCALE(x) (__s16)( ((x)+4) >> 3)

/* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */

/* Fully parenthesized so RL() composes safely inside larger expressions
   (previously the trailing operator of e.g. RL(v)+1 bound into the
   ternary's false branch). */
#define RL(x) (((x)>235) ? 235 : (((x)<16) ? 16 : (x)))
/* var * const in 8-bit fixed point, rounded back down to integer. */
#define MULTIPLY(var,const) (((__s32) ((var) * (const)) + 128)>>8)
1553
1554 void RTjpeg_idct_init(void)
1555 {
1556 int i;
1557
1558 for(i=0; i<64; i++)
1559 {
1560 RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32;
1561 RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32;
1562 }
1563 }
1564
1565 void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
1566 {
1567 #ifdef MMX
1568
1569 static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL;
1570 static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
1571 static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL;
1572 static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL;
1573 static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL;
1574
1575 mmx_t workspace[64];
1576 mmx_t *wsptr = workspace;
1577 register mmx_t *dataptr = (mmx_t *)odata;
1578 mmx_t *idata = (mmx_t *)data;
1579
1580 rskip = rskip>>3;
1581 /*
1582 * Perform inverse DCT on one block of coefficients.
1583 */
1584
1585 /* Odd part */
1586
1587 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1588
1589 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1590
1591 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1592
1593 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1594
1595 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1596
1597 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1598
1599 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1600
1601 psllw_i2r(2, mm2); // shift z10
1602 movq_r2r(mm2, mm0); // copy z10
1603
1604 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1605 movq_r2r(mm3, mm5); // copy tmp4
1606
1607 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1608 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1609
1610 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1611 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1612
1613 psubw_r2r(mm1, mm6); // z11-z13
1614 psllw_i2r(2, mm5); // shift z12
1615
1616 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1617 movq_r2r(mm5, mm7); // copy z12
1618
1619 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1620 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1621
1622 //ok
1623
1624 /* Even part */
1625 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1626 psllw_i2r(2, mm6);
1627
1628 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1629
1630 paddw_r2r(mm5, mm0); // tmp10
1631
1632 paddw_r2r(mm7, mm2); // tmp12
1633
1634 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1635 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1636
1637 movq_r2r(mm1, mm5); // copy tmp1
1638 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1639
1640 psubw_r2r(mm4, mm5); // tmp1-tmp3
1641 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1642
1643 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1644 psllw_i2r(2, mm5); // shift tmp1-tmp3
1645
1646 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1647
1648 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1649 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1650
1651 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1652
1653 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1654
1655 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1656 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1657
1658 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1659 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1660
1661 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1662 movq_r2r(mm1, mm5); // copy tmp11
1663
1664 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1665 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1666
1667 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1668
1669 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1670 movq_r2r(mm7, mm0); // copy tmp0
1671
1672 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1673 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1674
1675 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1676
1677 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1678 movq_r2r(mm1, mm3); // copy tmp1
1679
1680 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1681 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1682
1683 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1684
1685 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1686 movq_r2r(mm4, mm1); // copy tmp3
1687
1688 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1689
1690 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1691
1692 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1693
1694 movq_r2m(mm4, *(wsptr+8));
1695 movq_r2r(mm5, mm7); // copy tmp2
1696
1697 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1698
1699 movq_r2m(mm1, *(wsptr+6));
1700 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1701
1702 movq_r2m(mm5, *(wsptr+4));
1703
1704 movq_r2m(mm7, *(wsptr+10));
1705
1706 //ok
1707
1708
1709 /*****************************************************************/
1710
1711 idata++;
1712 wsptr++;
1713
1714 /*****************************************************************/
1715
1716 movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1717
1718 movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1719
1720 movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1721 movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1722
1723 movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1724 paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1725
1726 psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1727
1728 psllw_i2r(2, mm2); // shift z10
1729 movq_r2r(mm2, mm0); // copy z10
1730
1731 pmulhw_m2r(fix_184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1732 movq_r2r(mm3, mm5); // copy tmp4
1733
1734 pmulhw_m2r(fix_n184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1735 paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1736
1737 movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1738 psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1739
1740 psubw_r2r(mm1, mm6); // z11-z13
1741 psllw_i2r(2, mm5); // shift z12
1742
1743 movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1744 movq_r2r(mm5, mm7); // copy z12
1745
1746 pmulhw_m2r(fix_108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1747 paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1748
1749 //ok
1750
1751 /* Even part */
1752 pmulhw_m2r(fix_184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1753 psllw_i2r(2, mm6);
1754
1755 movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1756
1757 paddw_r2r(mm5, mm0); // tmp10
1758
1759 paddw_r2r(mm7, mm2); // tmp12
1760
1761 pmulhw_m2r(fix_141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1762 psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1763
1764 movq_r2r(mm1, mm5); // copy tmp1
1765 paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1766
1767 psubw_r2r(mm4, mm5); // tmp1-tmp3
1768 psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1769
1770 movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1771 psllw_i2r(2, mm5); // shift tmp1-tmp3
1772
1773 movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1774 paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1775
1776 pmulhw_m2r(fix_141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1777
1778 movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1779
1780 psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1781
1782 movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1783 movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1784
1785 movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1786 psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1787
1788 paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1789 movq_r2r(mm1, mm5); // copy tmp11
1790
1791 paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1792 movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1793
1794 paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1795
1796 psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1797 movq_r2r(mm7, mm0); // copy tmp0
1798
1799 psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1800 paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1801
1802 psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1803
1804 movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1805 movq_r2r(mm1, mm3); // copy tmp1
1806
1807 movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1808 paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1809
1810 psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1811
1812 movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1813 movq_r2r(mm4, mm1); // copy tmp3
1814
1815 movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1816
1817 paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1818
1819 psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1820
1821 movq_r2m(mm4, *(wsptr+8));
1822 movq_r2r(mm5, mm7); // copy tmp2
1823
1824 paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1825
1826 movq_r2m(mm1, *(wsptr+6));
1827 psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1828
1829 movq_r2m(mm5, *(wsptr+4));
1830
1831 movq_r2m(mm7, *(wsptr+10));
1832
1833 /*****************************************************************/
1834
1835 /* Pass 2: process rows from work array, store into output array. */
1836 /* Note that we must descale the results by a factor of 8 == 2**3, */
1837 /* and also undo the PASS1_BITS scaling. */
1838
1839 /*****************************************************************/
1840 /* Even part */
1841
1842 wsptr--;
1843
1844 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1845 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1846 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1847 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1848 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1849
1850 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1851 movq_r2r(mm0, mm2);
1852
1853 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1854 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1855
1856 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1857 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1858
1859 movq_r2r(mm0, mm6);
1860 movq_r2r(mm3, mm5);
1861
1862 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1863 movq_r2r(mm2, mm1);
1864
1865 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1866 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1867
1868 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1869 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1870
1871 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1872 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1873
1874 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1875 movq_r2r(mm3, mm4);
1876
1877 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1878 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1879
1880 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1881 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1882
1883
1884 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1885 movq_r2r(mm6, mm2);
1886
1887 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1888 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1889
1890 movq_r2r(mm3, mm5);
1891 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1892
1893 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1894 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1895
1896 movq_r2r(mm4, mm7);
1897 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1898
1899 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1900
1901 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1902
1903 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1904 movq_r2r(mm1, mm6);
1905
1906 //ok
1907
1908 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1909 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1910
1911
1912 movq_r2r(mm0, mm2);
1913 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1914
1915 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1916 psllw_i2r(2, mm6);
1917
1918 pmulhw_m2r(fix_141, mm6);
1919 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1920
1921 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1922 movq_r2r(mm0, mm7);
1923
1924 // tmp0 = tmp10 + tmp13;
1925 // tmp3 = tmp10 - tmp13;
1926 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1927 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1928
1929 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1930 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1931 // tmp1 = tmp11 + tmp12;
1932 // tmp2 = tmp11 - tmp12;
1933 movq_r2r(mm1, mm5);
1934
1935 //OK
1936
1937 /* Odd part */
1938
1939 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1940 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1941 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1942 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1943 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1944 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1945
1946 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1947 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1948
1949 movq_r2r(mm3, mm6);
1950 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1951
1952 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1953 movq_r2r(mm3, mm2);
1954
1955 //Save tmp0 and tmp1 in wsptr
1956 movq_r2m(mm0, *(wsptr)); // save tmp0
1957 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1958
1959
1960 //Continue with z10 --- z13
1961 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1962 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1963
1964 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1965 movq_r2r(mm6, mm4);
1966
1967 movq_r2m(mm1, *(wsptr+1)); // save tmp1
1968 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1969
1970 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1971 movq_r2r(mm6, mm1);
1972
1973 //Save tmp2 and tmp3 in wsptr
1974 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1975 movq_r2r(mm2, mm4);
1976
1977 //Continue with z10 --- z13
1978 movq_r2m(mm5, *(wsptr+2)); // save tmp2
1979 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1980
1981 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1982 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1983
1984 movq_r2r(mm3, mm0);
1985 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1986
1987 movq_r2m(mm7, *(wsptr+3)); // save tmp3
1988 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1989
1990 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1991 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1992
1993 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
1994 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1995
1996 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
1997 movq_r2r(mm6, mm4);
1998
1999 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2000 movq_r2r(mm1, mm5);
2001
2002 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2003 movq_r2r(mm6, mm2);
2004
2005 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2006 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2007
2008 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2009 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2010
2011 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2012 movq_r2r(mm1, mm7);
2013
2014 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2015 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2016
2017 movq_r2r(mm6, mm5);
2018 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2019
2020 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2021 movq_r2r(mm2, mm4);
2022
2023 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2024
2025 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2026
2027 punpckhdq_r2r(mm6, mm4); /// wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2028
2029 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2030 movq_r2r(mm0, mm5);
2031
2032 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2033
2034 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2035 movq_r2r(mm3, mm4);
2036
2037 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2038 movq_r2r(mm5, mm1);
2039
2040 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2041 // tmp7 = z11 + z13; /* phase 5 */
2042 // tmp8 = z11 - z13; /* phase 5 */
2043 psubw_r2r(mm4, mm1); // tmp8
2044
2045 paddw_r2r(mm4, mm5); // tmp7
2046 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2047 psllw_i2r(2, mm1);
2048
2049 psllw_i2r(2, mm0);
2050
2051 pmulhw_m2r(fix_141, mm1); // tmp21
2052 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2053 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2054 psllw_i2r(2, mm3);
2055 movq_r2r(mm0, mm7);
2056
2057 pmulhw_m2r(fix_n184, mm7);
2058 movq_r2r(mm3, mm6);
2059
2060 movq_m2r(*(wsptr), mm2); // tmp0,final1
2061
2062 pmulhw_m2r(fix_108n184, mm6);
2063 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2064 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2065 movq_r2r(mm2, mm4); // final1
2066
2067 pmulhw_m2r(fix_184n261, mm0);
2068 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2069
2070 pmulhw_m2r(fix_184, mm3);
2071 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2072
2073 // tmp6 = tmp22 - tmp7; /* phase 2 */
2074 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2075
2076 paddw_r2r(mm6, mm7); // tmp20
2077 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2078
2079 paddw_r2r(mm0, mm3); // tmp22
2080
2081 // tmp5 = tmp21 - tmp6;
2082 psubw_r2r(mm5, mm3); // tmp6
2083
2084 // tmp4 = tmp20 + tmp5;
2085 movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2086 psubw_r2r(mm3, mm1); // tmp5
2087
2088 movq_r2r(mm0, mm6); // final2
2089 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2090
2091 /* Final output stage: scale down by a factor of 8 and range-limit */
2092
2093
2094 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2095 // & RANGE_MASK];
2096 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2097 // & RANGE_MASK]; final1
2098
2099
2100 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2101 // & RANGE_MASK];
2102 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2103 // & RANGE_MASK]; final2
2104 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2105 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2106
2107 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2108
2109 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2110
2111 movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2112 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2113
2114 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2115 // & RANGE_MASK];
2116 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2117 // & RANGE_MASK]; final3
2118 paddw_r2r(mm1, mm7); // tmp4
2119 movq_r2r(mm5, mm3);
2120
2121 paddw_r2r(mm1, mm5); // tmp2+tmp5
2122 psubw_r2r(mm1, mm3); // tmp2-tmp5
2123
2124 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2125
2126 movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2127 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2128
2129
2130
2131 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2132 // & RANGE_MASK];
2133 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2134 // & RANGE_MASK]; final4
2135 movq_r2r(mm4, mm6);
2136 paddw_r2r(mm7, mm4); // tmp3+tmp4
2137
2138 psubw_r2r(mm7, mm6); // tmp3-tmp4
2139 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2140
2141 // mov ecx, [dataptr]
2142
2143 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2144
2145 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2146
2147 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2148 movq_r2r(mm2, mm4);
2149
2150 movq_r2r(mm5, mm7);
2151 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2152
2153 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2154 movq_r2r(mm2, mm1);
2155
2156 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2157
2158 // add dataptr, 4
2159
2160 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2161
2162 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2163
2164 // add ecx, output_col
2165
2166 movq_r2r(mm7, mm6);
2167 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2168
2169 movq_r2r(mm2, mm0);
2170 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2171
2172 // mov idata, [dataptr]
2173
2174 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2175
2176 // add dataptr, 4
2177
2178 movq_r2r(mm1, mm3);
2179
2180 // add idata, output_col
2181
2182 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2183
2184 movq_r2m(mm2, *(dataptr));
2185
2186 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2187
2188 dataptr += rskip;
2189 movq_r2m(mm0, *(dataptr));
2190
2191 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2192 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2193
2194 dataptr += rskip;
2195 movq_r2m(mm1, *(dataptr));
2196
2197 dataptr += rskip;
2198 movq_r2m(mm3, *(dataptr));
2199
2200 /*******************************************************************/
2201
2202 wsptr += 8;
2203
2204 /*******************************************************************/
2205
2206 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2207 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2208 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2209 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2210 movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2211
2212 movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2213 movq_r2r(mm0, mm2);
2214
2215 movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2216 paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2217
2218 movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2219 psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2220
2221 movq_r2r(mm0, mm6);
2222 movq_r2r(mm3, mm5);
2223
2224 paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2225 movq_r2r(mm2, mm1);
2226
2227 psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2228 punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2229
2230 movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2231 punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2232
2233 movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2234 punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2235
2236 punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2237 movq_r2r(mm3, mm4);
2238
2239 movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2240 punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2241
2242 movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2243 punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2244
2245 paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2246 movq_r2r(mm6, mm2);
2247
2248 psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2249 paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2250
2251 movq_r2r(mm3, mm5);
2252 punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2253
2254 psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2255 punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2256
2257 movq_r2r(mm4, mm7);
2258 punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2259
2260 punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2261
2262 punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2263
2264 punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2265 movq_r2r(mm1, mm6);
2266
2267 //OK
2268
2269 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2270 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2271
2272 movq_r2r(mm0, mm2);
2273 punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2274
2275 punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2276 psllw_i2r(2, mm6);
2277
2278 pmulhw_m2r(fix_141, mm6);
2279 punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2280
2281 punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2282 movq_r2r(mm0, mm7);
2283
2284 // tmp0 = tmp10 + tmp13;
2285 // tmp3 = tmp10 - tmp13;
2286 paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2287 psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2288
2289 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2290 psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2291 // tmp1 = tmp11 + tmp12;
2292 // tmp2 = tmp11 - tmp12;
2293 movq_r2r(mm1, mm5);
2294
2295 //OK
2296
2297
2298 /* Odd part */
2299
2300 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2301 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2302 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2303 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2304 movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2305 paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2306
2307 movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2308 psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2309
2310 movq_r2r(mm3, mm6);
2311 punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2312
2313 punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2314 movq_r2r(mm3, mm2);
2315
2316 //Save tmp0 and tmp1 in wsptr
2317 movq_r2m(mm0, *(wsptr)); // save tmp0
2318 paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2319
2320
2321 //Continue with z10 --- z13
2322 movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2323 psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2324
2325 movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2326 movq_r2r(mm6, mm4);
2327
2328 movq_r2m(mm1, *(wsptr+1)); // save tmp1
2329 punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2330
2331 punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2332 movq_r2r(mm6, mm1);
2333
2334 //Save tmp2 and tmp3 in wsptr
2335 paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2336 movq_r2r(mm2, mm4);
2337
2338 //Continue with z10 --- z13
2339 movq_r2m(mm5, *(wsptr+2)); // save tmp2
2340 punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2341
2342 psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2343 punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2344
2345 movq_r2r(mm3, mm0);
2346 punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2347
2348 movq_r2m(mm7, *(wsptr+3)); // save tmp3
2349 punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2350
2351 movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2352 punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2353
2354 movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2355 punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2356
2357 movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2358 movq_r2r(mm6, mm4);
2359
2360 punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2361 movq_r2r(mm1, mm5);
2362
2363 punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2364 movq_r2r(mm6, mm2);
2365
2366 movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2367 paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2368
2369 psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2370 punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2371
2372 punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2373 movq_r2r(mm1, mm7);
2374
2375 paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2376 psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2377
2378 movq_r2r(mm6, mm5);
2379 punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2380
2381 punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2382 movq_r2r(mm2, mm4);
2383
2384 punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2385
2386 punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2387
2388 punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2389
2390 punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2391 movq_r2r(mm0, mm5);
2392
2393 punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2394
2395 punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2396 movq_r2r(mm3, mm4);
2397
2398 punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2399 movq_r2r(mm5, mm1);
2400
2401 punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2402 // tmp7 = z11 + z13; /* phase 5 */
2403 // tmp8 = z11 - z13; /* phase 5 */
2404 psubw_r2r(mm4, mm1); // tmp8
2405
2406 paddw_r2r(mm4, mm5); // tmp7
2407 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2408 psllw_i2r(2, mm1);
2409
2410 psllw_i2r(2, mm0);
2411
2412 pmulhw_m2r(fix_141, mm1); // tmp21
2413 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2414 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2415 psllw_i2r(2, mm3);
2416 movq_r2r(mm0, mm7);
2417
2418 pmulhw_m2r(fix_n184, mm7);
2419 movq_r2r(mm3, mm6);
2420
2421 movq_m2r(*(wsptr), mm2); // tmp0,final1
2422
2423 pmulhw_m2r(fix_108n184, mm6);
2424 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2425 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2426 movq_r2r(mm2, mm4); // final1
2427
2428 pmulhw_m2r(fix_184n261, mm0);
2429 paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2430
2431 pmulhw_m2r(fix_184, mm3);
2432 psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2433
2434 // tmp6 = tmp22 - tmp7; /* phase 2 */
2435 psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2436
2437 paddw_r2r(mm6, mm7); // tmp20
2438 psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2439
2440 paddw_r2r(mm0, mm3); // tmp22
2441
2442 // tmp5 = tmp21 - tmp6;
2443 psubw_r2r(mm5, mm3); // tmp6
2444
2445 // tmp4 = tmp20 + tmp5;
2446 movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2447 psubw_r2r(mm3, mm1); // tmp5
2448
2449 movq_r2r(mm0, mm6); // final2
2450 paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2451
2452 /* Final output stage: scale down by a factor of 8 and range-limit */
2453
2454 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2455 // & RANGE_MASK];
2456 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2457 // & RANGE_MASK]; final1
2458
2459
2460 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2461 // & RANGE_MASK];
2462 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2463 // & RANGE_MASK]; final2
2464 psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2465 psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2466
2467 psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2468
2469 packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2470
2471 movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2472 packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2473
2474 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2475 // & RANGE_MASK];
2476 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2477 // & RANGE_MASK]; final3
2478 paddw_r2r(mm1, mm7); // tmp4
2479 movq_r2r(mm5, mm3);
2480
2481 paddw_r2r(mm1, mm5); // tmp2+tmp5
2482 psubw_r2r(mm1, mm3); // tmp2-tmp5
2483
2484 psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2485
2486 movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2487 psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2488
2489
2490
2491 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2492 // & RANGE_MASK];
2493 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2494 // & RANGE_MASK]; final4
2495 movq_r2r(mm4, mm6);
2496 paddw_r2r(mm7, mm4); // tmp3+tmp4
2497
2498 psubw_r2r(mm7, mm6); // tmp3-tmp4
2499 psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2500
2501 psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2502
2503 /*
2504 movq_r2m(mm4, *dummy);
2505 fprintf(stderr, "3-4 %016llx\n", dummy);
2506 movq_r2m(mm4, *dummy);
2507 fprintf(stderr, "3+4 %016llx\n", dummy);
2508 */
2509
2510
2511 packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2512
2513 packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2514 movq_r2r(mm2, mm4);
2515
2516 movq_r2r(mm5, mm7);
2517 punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2518
2519 punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2520 movq_r2r(mm2, mm1);
2521
2522 punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2523
2524 punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2525
2526 punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2527
2528 movq_r2r(mm7, mm6);
2529 punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2530
2531 movq_r2r(mm2, mm0);
2532 punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2533
2534 punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2535
2536 movq_r2r(mm1, mm3);
2537
2538 punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2539
2540 dataptr += rskip;
2541 movq_r2m(mm2, *(dataptr));
2542
2543 punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2544
2545 dataptr += rskip;
2546 movq_r2m(mm0, *(dataptr));
2547
2548 punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2549
2550 punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2551
2552 dataptr += rskip;
2553 movq_r2m(mm1, *(dataptr));
2554
2555 dataptr += rskip;
2556 movq_r2m(mm3, *(dataptr));
2557
2558 #else
2559 __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2560 __s32 tmp10, tmp11, tmp12, tmp13;
2561 __s32 z5, z10, z11, z12, z13;
2562 __s16 *inptr;
2563 __s32 *wsptr;
2564 __u8 *outptr;
2565 int ctr;
2566 __s32 dcval;
2567 __s32 workspace[64];
2568
2569 inptr = data;
2570 wsptr = workspace;
2571 for (ctr = 8; ctr > 0; ctr--) {
2572
2573 if ((inptr[8] | inptr[16] | inptr[24] |
2574 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2575 dcval = inptr[0];
2576 wsptr[0] = dcval;
2577 wsptr[8] = dcval;
2578 wsptr[16] = dcval;
2579 wsptr[24] = dcval;
2580 wsptr[32] = dcval;
2581 wsptr[40] = dcval;
2582 wsptr[48] = dcval;
2583 wsptr[56] = dcval;
2584
2585 inptr++;
2586 wsptr++;
2587 continue;
2588 }
2589
2590 tmp0 = inptr[0];
2591 tmp1 = inptr[16];
2592 tmp2 = inptr[32];
2593 tmp3 = inptr[48];
2594
2595 tmp10 = tmp0 + tmp2;
2596 tmp11 = tmp0 - tmp2;
2597
2598 tmp13 = tmp1 + tmp3;
2599 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2600
2601 tmp0 = tmp10 + tmp13;
2602 tmp3 = tmp10 - tmp13;
2603 tmp1 = tmp11 + tmp12;
2604 tmp2 = tmp11 - tmp12;
2605
2606 tmp4 = inptr[8];
2607 tmp5 = inptr[24];
2608 tmp6 = inptr[40];
2609 tmp7 = inptr[56];
2610
2611 z13 = tmp6 + tmp5;
2612 z10 = tmp6 - tmp5;
2613 z11 = tmp4 + tmp7;
2614 z12 = tmp4 - tmp7;
2615
2616 tmp7 = z11 + z13;
2617 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2618
2619 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2620 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2621 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2622
2623 tmp6 = tmp12 - tmp7;
2624 tmp5 = tmp11 - tmp6;
2625 tmp4 = tmp10 + tmp5;
2626
2627 wsptr[0] = (__s32) (tmp0 + tmp7);
2628 wsptr[56] = (__s32) (tmp0 - tmp7);
2629 wsptr[8] = (__s32) (tmp1 + tmp6);
2630 wsptr[48] = (__s32) (tmp1 - tmp6);
2631 wsptr[16] = (__s32) (tmp2 + tmp5);
2632 wsptr[40] = (__s32) (tmp2 - tmp5);
2633 wsptr[32] = (__s32) (tmp3 + tmp4);
2634 wsptr[24] = (__s32) (tmp3 - tmp4);
2635
2636 inptr++;
2637 wsptr++;
2638 }
2639
2640 wsptr = workspace;
2641 for (ctr = 0; ctr < 8; ctr++) {
2642 outptr = &(odata[ctr*rskip]);
2643
2644 tmp10 = wsptr[0] + wsptr[4];
2645 tmp11 = wsptr[0] - wsptr[4];
2646
2647 tmp13 = wsptr[2] + wsptr[6];
2648 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2649
2650 tmp0 = tmp10 + tmp13;
2651 tmp3 = tmp10 - tmp13;
2652 tmp1 = tmp11 + tmp12;
2653 tmp2 = tmp11 - tmp12;
2654
2655 z13 = wsptr[5] + wsptr[3];
2656 z10 = wsptr[5] - wsptr[3];
2657 z11 = wsptr[1] + wsptr[7];
2658 z12 = wsptr[1] - wsptr[7];
2659
2660 tmp7 = z11 + z13;
2661 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2662
2663 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2664 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2665 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2666
2667 tmp6 = tmp12 - tmp7;
2668 tmp5 = tmp11 - tmp6;
2669 tmp4 = tmp10 + tmp5;
2670
2671 outptr[0] = RL(DESCALE(tmp0 + tmp7));
2672 outptr[7] = RL(DESCALE(tmp0 - tmp7));
2673 outptr[1] = RL(DESCALE(tmp1 + tmp6));
2674 outptr[6] = RL(DESCALE(tmp1 - tmp6));
2675 outptr[2] = RL(DESCALE(tmp2 + tmp5));
2676 outptr[5] = RL(DESCALE(tmp2 - tmp5));
2677 outptr[4] = RL(DESCALE(tmp3 + tmp4));
2678 outptr[3] = RL(DESCALE(tmp3 - tmp4));
2679
2680 wsptr += 8;
2681 }
2682 #endif
2683 }
2684 /*
2685
2686 Main Routines
2687
2688 This file contains most of the initialisation and control functions
2689
2690 (C) Justin Schoeman 1998
2691
2692 */
2693
2694 /*
2695
2696 Private function
2697
Initialise all the cache-aligned data blocks
2699
2700 */
2701
2702 void RTjpeg_init_data(void)
2703 {
2704 unsigned long dptr;
2705
2706 dptr=(unsigned long)&(RTjpeg_alldata[0]);
2707 dptr+=32;
2708 dptr=dptr>>5;
2709 dptr=dptr<<5; /* cache align data */
2710
2711 RTjpeg_block=(__s16 *)dptr;
2712 dptr+=sizeof(__s16)*64;
2713 RTjpeg_lqt=(__s32 *)dptr;
2714 dptr+=sizeof(__s32)*64;
2715 RTjpeg_cqt=(__s32 *)dptr;
2716 dptr+=sizeof(__s32)*64;
2717 RTjpeg_liqt=(__u32 *)dptr;
2718 dptr+=sizeof(__u32)*64;
2719 RTjpeg_ciqt=(__u32 *)dptr;
2720 }
2721
2722 /*
2723
2724 External Function
2725
2726 Re-set quality factor
2727
Input: Q -> quality factor (192=best, 32=worst)
2731 */
2732
/*
 Rebuild the luma/chroma quantiser tables for quality factor Q and
 re-run the DCT/IDCT/quantiser initialisation.
*/
void RTjpeg_init_Q(__u8 Q)
{
 int i;
 __u64 qual;

 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 for(i=0; i<64; i++)
 {
 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; /* clamp so the divisions below never see 0 */
 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
 /* inverse tables in 16.16 fixed point; forward tables are then
    re-derived from the inverses so the pair round-trips exactly */
 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }

 /* lb8/cb8: index (in zig-zag order) of the last coefficient whose
    inverse quantiser is <= 8; used by b2s/s2b to pick byte coding.
    NOTE(review): the ++ walks RTjpeg_ZZ without a bound -- if every
    entry were <= 8 this would run past index 63; confirm the Q range
    makes that impossible. */
 RTjpeg_lb8=0;
 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 RTjpeg_lb8--;
 RTjpeg_cb8=0;
 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 RTjpeg_cb8--;

 RTjpeg_dct_init();
 RTjpeg_idct_init();
 RTjpeg_quant_init();
}
2763
2764 /*
2765
2766 External Function
2767
2768 Initialise compression.
2769
2770 Input: buf -> pointer to 128 ints for quant values store to pass back to
2771 init_decompress.
2772 width -> width of image
2773 height -> height of image
2774 Q -> quality factor (192=best, 32=worst)
2775
2776 */
2777
/*
 Initialise the compressor: record frame geometry, derive the quant
 tables for quality Q, and export the inverse tables into buf
 (buf[0..63] = luma, buf[64..127] = chroma) for the decompressor.
*/
void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
{
 int i;
 __u64 qual;

 RTjpeg_init_data();

 RTjpeg_width=width;
 RTjpeg_height=height;
 RTjpeg_Ywidth = RTjpeg_width>>3; /* luma blocks per row */
 RTjpeg_Ysize=width * height;
 RTjpeg_Cwidth = RTjpeg_width>>4; /* chroma blocks per row (half width) */
 RTjpeg_Csize= (width>>1) * height;

 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 /* same table derivation as RTjpeg_init_Q -- keep the two in sync */
 for(i=0; i<64; i++)
 {
 RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
 if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; /* clamp so the divisions below never see 0 */
 RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
 if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
 RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
 RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
 RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
 RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }

 /* last zig-zag index with inverse quantiser <= 8 (see init_Q note:
    the unbounded ++ walk assumes at least one entry is > 8) */
 RTjpeg_lb8=0;
 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
 RTjpeg_lb8--;
 RTjpeg_cb8=0;
 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
 RTjpeg_cb8--;

 RTjpeg_dct_init();
 RTjpeg_quant_init();

 /* hand the inverse tables to the caller for RTjpeg_init_decompress */
 for(i=0; i<64; i++)
 buf[i]=RTjpeg_liqt[i];
 for(i=0; i<64; i++)
 buf[64+i]=RTjpeg_ciqt[i];
}
2821
2822 void RTjpeg_init_decompress(__u32 *buf, int width, int height)
2823 {
2824 int i;
2825
2826 RTjpeg_init_data();
2827
2828 RTjpeg_width=width;
2829 RTjpeg_height=height;
2830 RTjpeg_Ywidth = RTjpeg_width>>3;
2831 RTjpeg_Ysize=width * height;
2832 RTjpeg_Cwidth = RTjpeg_width>>4;
2833 RTjpeg_Csize= (width>>1) * height;
2834
2835 for(i=0; i<64; i++)
2836 {
2837 RTjpeg_liqt[i]=buf[i];
2838 RTjpeg_ciqt[i]=buf[i+64];
2839 }
2840
2841 RTjpeg_lb8=0;
2842 while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8);
2843 RTjpeg_lb8--;
2844 RTjpeg_cb8=0;
2845 while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8);
2846 RTjpeg_cb8--;
2847
2848 RTjpeg_idct_init();
2849
2850 // RTjpeg_color_init();
2851 }
2852
2853 int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
2854 {
2855 __s8 * sb;
2856 register __s8 * bp1 = bp + (RTjpeg_width<<3);
2857 register __s8 * bp2 = bp + RTjpeg_Ysize;
2858 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);
2859 register int i, j, k;
2860
2861 #ifdef MMX
2862 emms();
2863 #endif
2864 sb=sp;
2865 /* Y */
2866 for(i=RTjpeg_height>>1; i; i-=8)
2867 {
2868 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2869 {
2870 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2871 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2872 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2873
2874 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2875 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2876 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2877
2878 RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
2879 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2880 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2881
2882 RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
2883 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2884 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2885
2886 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2887 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2888 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2889
2890 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2891 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2892 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2893
2894 }
2895 bp+=RTjpeg_width<<4;
2896 bp1+=RTjpeg_width<<4;
2897 bp2+=RTjpeg_width<<2;
2898 bp3+=RTjpeg_width<<2;
2899
2900 }
2901 #ifdef MMX
2902 emms();
2903 #endif
2904 return (sp-sb);
2905 }
2906
2907 int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp)
2908 {
2909 __s8 * sb;
2910 register __s8 * bp2 = bp + RTjpeg_Ysize;
2911 register __s8 * bp3 = bp2 + RTjpeg_Csize;
2912 register int i, j, k;
2913
2914 #ifdef MMX
2915 emms();
2916 #endif
2917 sb=sp;
2918 /* Y */
2919 for(i=RTjpeg_height; i; i-=8)
2920 {
2921 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
2922 {
2923 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
2924 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2925 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2926
2927 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
2928 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2929 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2930
2931 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
2932 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2933 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2934
2935 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
2936 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
2937 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
2938
2939 }
2940 bp+=RTjpeg_width<<3;
2941 bp2+=RTjpeg_width<<2;
2942 bp3+=RTjpeg_width<<2;
2943
2944 }
2945 #ifdef MMX
2946 emms();
2947 #endif
2948 return (sp-sb);
2949 }
2950
2951 int RTjpeg_compress8(__s8 *sp, unsigned char *bp)
2952 {
2953 __s8 * sb;
2954 int i, j;
2955
2956 #ifdef MMX
2957 emms();
2958 #endif
2959
2960 sb=sp;
2961 /* Y */
2962 for(i=0; i<RTjpeg_height; i+=8)
2963 {
2964 for(j=0; j<RTjpeg_width; j+=8)
2965 {
2966 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
2967 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
2968 sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
2969 }
2970 bp+=RTjpeg_width;
2971 }
2972
2973 #ifdef MMX
2974 emms();
2975 #endif
2976 return (sp-sb);
2977 }
2978
/*
 Decompress a (possibly motion-compressed) YUV422 stream into a planar
 frame: bp = Y plane, then the half-width U and V planes.  A stream
 byte of -1 (the 0xff skip code written by the mcompress functions)
 means "block unchanged": the old pixels are left in place.
*/
void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp)
{
 /* NOTE(review): bp2/bp3 are __s8* initialised from the __u8* frame
    pointer without a cast (implicit sign change) -- harmless on common
    ABIs, but worth adding the cast */
 register __s8 * bp2 = bp + RTjpeg_Ysize; /* U plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize); /* V plane */
 int i, j,k;

#ifdef MMX
 emms();
#endif

 /* Y */
 for(i=RTjpeg_height; i; i-=8) /* one 8-row band per pass */
 {
 /* per step: two luma blocks (j, j+8) and one block of each chroma plane (k) */
 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
 if(*sp==-1)sp++; /* skip code: keep previous pixels */
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1); /* chroma stride is half */
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
 }
 }
 /* advance 8 luma rows / 8 half-width chroma rows */
 bp+=RTjpeg_width<<3;
 bp2+=RTjpeg_width<<2;
 bp3+=RTjpeg_width<<2;
 }
#ifdef MMX
 emms();
#endif
}
3026
/*
 Decompress a (possibly motion-compressed) YUV420 stream into a planar
 frame: bp = Y plane, then the quarter-size U and V planes.  Each pass
 handles two 8-row luma bands (bp and bp1) plus one band of each chroma
 plane.  A stream byte of -1 (0xff skip code) means "block unchanged".
*/
void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
{
 /* NOTE(review): bp1/bp2/bp3 are __s8* initialised from the __u8*
    frame pointer without a cast (implicit sign change) */
 register __s8 * bp1 = bp + (RTjpeg_width<<3); /* second luma band */
 register __s8 * bp2 = bp + RTjpeg_Ysize; /* U plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); /* V plane */
 int i, j,k;

#ifdef MMX
 emms();
#endif

 /* Y */
 for(i=RTjpeg_height>>1; i; i-=8) /* two 8-row luma bands per pass */
 {
 /* per step: four luma blocks and one block of each chroma plane */
 for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
 if(*sp==-1)sp++; /* skip code: keep previous pixels */
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
 RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1); /* chroma stride is half */
 }
 if(*sp==-1)sp++;
 else
 {
 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
 RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
 }
 }
 /* advance 16 luma rows / 8 half-width chroma rows */
 bp+=RTjpeg_width<<4;
 bp1+=RTjpeg_width<<4;
 bp2+=RTjpeg_width<<2;
 bp3+=RTjpeg_width<<2;
 }
#ifdef MMX
 emms();
#endif
}
3088
3089 void RTjpeg_decompress8(__s8 *sp, __u8 *bp)
3090 {
3091 int i, j;
3092
3093 #ifdef MMX
3094 emms();
3095 #endif
3096
3097 /* Y */
3098 for(i=0; i<RTjpeg_height; i+=8)
3099 {
3100 for(j=0; j<RTjpeg_width; j+=8)
3101 if(*sp==-1)sp++;
3102 else
3103 {
3104 sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
3105 RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
3106 }
3107 bp+=RTjpeg_width<<3;
3108 }
3109 }
3110
3111 /*
3112 External Function
3113
3114 Initialise additional data structures for motion compensation
3115
3116 */
3117
3118 void RTjpeg_init_mcompress(void)
3119 {
3120 unsigned long tmp;
3121
3122 if(!RTjpeg_old)
3123 {
3124 RTjpeg_old=malloc((4*RTjpeg_width*RTjpeg_height)+32);
3125 tmp=(unsigned long)RTjpeg_old;
3126 tmp+=32;
3127 tmp=tmp>>5;
3128 RTjpeg_old=(__s16 *)(tmp<<5);
3129 }
3130 if (!RTjpeg_old)
3131 {
3132 fprintf(stderr, "RTjpeg: Could not allocate memory\n");
3133 exit(-1);
3134 }
3135 bzero(RTjpeg_old, ((4*RTjpeg_width*RTjpeg_height)));
3136 }
3137
3138 #ifdef MMX
3139
/*
 MMX block compare: test the freshly quantised RTjpeg_block against the
 previous frame's block `old`, coefficient by coefficient, with the
 per-word threshold in `mask`.  Returns 1 if every difference is within
 the threshold (block may be skipped), 0 if any coefficient changed
 too much.
*/
int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
{
 int i;
 mmx_t *mold=(mmx_t *)old;
 mmx_t *mblock=(mmx_t *)RTjpeg_block;
 volatile mmx_t result; /* volatile: forces the MMX store before the read below */
 static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL;

 movq_m2r(*mask, mm7); /* mm7 = threshold (4 words) */
 movq_m2r(neg, mm6); /* mm6 = all ones, used to test the negative side */
 pxor_r2r(mm5, mm5); /* mm5 accumulates "over threshold" flags */

 /* 8 iterations x 8 words = all 64 coefficients */
 for(i=0; i<8; i++)
 {
 movq_m2r(*(mblock++), mm0);
 movq_m2r(*(mblock++), mm2);
 movq_m2r(*(mold++), mm1);
 movq_m2r(*(mold++), mm3);
 psubsw_r2r(mm1, mm0); /* signed saturated difference new-old */
 psubsw_r2r(mm3, mm2);
 movq_r2r(mm0, mm1);
 movq_r2r(mm2, mm3);
 pcmpgtw_r2r(mm7, mm0); /* diff > mask ? (positive side) */
 pcmpgtw_r2r(mm7, mm2);
 /* XOR with all-ones = one's complement (~diff = -diff-1): checks
    the negative side of the threshold the same way */
 pxor_r2r(mm6, mm1);
 pxor_r2r(mm6, mm3);
 pcmpgtw_r2r(mm7, mm1);
 pcmpgtw_r2r(mm7, mm3);
 por_r2r(mm0, mm5); /* OR all flags into the accumulator */
 por_r2r(mm2, mm5);
 por_r2r(mm1, mm5);
 por_r2r(mm3, mm5);
 }
 movq_r2m(mm5, result);

 if(result.q) /* any word flagged -> block changed */
 {
 /* NOTE(review): unlike the scalar fallback, the refresh of `old`
    from RTjpeg_block is commented out here -- confirm whether the
    reference store is updated elsewhere on the MMX path */
// if(!RTjpeg_mtest)
// for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
 return 0;
 }
// printf(".");
 return 1;
}
3184
3185 #else
3186 int RTjpeg_bcomp(__s16 *old, __u16 *mask)
3187 {
3188 int i;
3189
3190 for(i=0; i<64; i++)
3191 if(abs(old[i]-RTjpeg_block[i])>*mask)
3192 {
3193 if(!RTjpeg_mtest)
3194 for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
3195 return 0;
3196 }
3197 return 1;
3198 }
3199 #endif
3200
/* Set "test" mode: when nonzero, RTjpeg_bcomp (scalar path) does not
   refresh the reference block store on a mismatch. */
void RTjpeg_set_test(int i)
{
 RTjpeg_mtest=i;
}
3205
/*
 * RTjpeg_mcompressYUV420: delta ("motion") compression of one planar
 * YUV 4:2:0 frame.
 *
 * sp    : output stream; per 8x8 block either the coded coefficients
 *         (RTjpeg_b2s) or a single 255 byte are appended.
 * bp    : input frame; planar Y of RTjpeg_Ysize bytes, then two chroma
 *         planes (second one starts RTjpeg_Csize>>1 after the first).
 * lmask : luma block-compare threshold; cmask: chroma threshold. Under
 *         MMX each is replicated into all four 16-bit lanes of an mmx_t.
 * Returns the number of bytes written to sp.
 *
 * Each outer iteration consumes 16 luma rows (bp/bp1 advance by
 * RTjpeg_width<<4) while the counter runs over height/2 in steps of 8.
 * Per 16 luma columns it codes 2x2 = four Y blocks and one block from
 * each chroma plane.
 */
int RTjpeg_mcompressYUV420(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
{
 __s8 * sb;
 /* NOTE(review): the local declaration below is commented out, so the
    `block` used in this function presumably refers to a file-scope
    variable declared elsewhere in this file -- TODO confirm (the
    YUV422/8-bit variants declare their own local `__s16 *block`). */
//rh __s16 *block;
 register __s8 * bp1 = bp + (RTjpeg_width<<3);   /* second 8-line Y band */
 register __s8 * bp2 = bp + RTjpeg_Ysize;        /* first chroma plane */
 register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1);  /* second chroma plane */
 register int i, j, k;

#ifdef MMX
 emms();
 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
#else
 RTjpeg_lmask=lmask;
 RTjpeg_cmask=cmask;
#endif

 sb=sp;
 block=RTjpeg_old;   /* walks the previous frame's quantised blocks */
/* Y */
 for(i=RTjpeg_height>>1; i; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   /* Y block 0: DCT + quantise, then compare against the previous
      frame's block; if similar, emit the 1-byte 255 token (presumably
      "unchanged block" to the decoder -- verify vs. decompressor). */
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   /* Y block 1 (right neighbour in the same band) */
   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   /* Y block 2 (second 8-line band, left) */
   RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   /* Y block 3 (second 8-line band, right) */
   RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   /* chroma plane 1 block (quantised with the chroma table) */
   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;

   /* chroma plane 2 block */
   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;
  }
  /* next 16 luma rows / 8 chroma rows */
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;

 }
#ifdef MMX
 emms();
#endif
 return (sp-sb);
}
3296
3297
3298 int RTjpeg_mcompressYUV422(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
3299 {
3300 __s8 * sb;
3301 __s16 *block;
3302 register __s8 * bp2;
3303 register __s8 * bp3;
3304 register int i, j, k;
3305
3306 #ifdef MMX
3307 emms();
3308 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
3309 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
3310 #else
3311 RTjpeg_lmask=lmask;
3312 RTjpeg_cmask=cmask;
3313 #endif
3314
3315 bp = bp - RTjpeg_width*0;
3316 bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0;
3317 bp3 = bp2 + RTjpeg_Csize;
3318
3319 sb=sp;
3320 block=RTjpeg_old;
3321 /* Y */
3322 for(i=RTjpeg_height; i; i-=8)
3323 {
3324 for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
3325 {
3326 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
3327 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3328 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3329 {
3330 *((__u8 *)sp++)=255;
3331 }
3332 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3333 block+=64;
3334
3335 RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
3336 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3337 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3338 {
3339 *((__u8 *)sp++)=255;
3340 }
3341 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3342 block+=64;
3343
3344 RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
3345 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3346 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3347 {
3348 *((__u8 *)sp++)=255;
3349 }
3350 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3351 block+=64;
3352
3353 RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
3354 RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
3355 if(RTjpeg_bcomp(block, &RTjpeg_cmask))
3356 {
3357 *((__u8 *)sp++)=255;
3358 }
3359 else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
3360 block+=64;
3361
3362 }
3363 bp+=RTjpeg_width<<3;
3364 bp2+=RTjpeg_width<<2;
3365 bp3+=RTjpeg_width<<2;
3366 }
3367 printf ("%d\n", block - RTjpeg_old);
3368 #ifdef MMX
3369 emms();
3370 #endif
3371 return (sp-sb);
3372 }
3373
3374 int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask)
3375 {
3376 __s8 * sb;
3377 __s16 *block;
3378 int i, j;
3379
3380 #ifdef MMX
3381 emms();
3382 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
3383 #else
3384 RTjpeg_lmask=lmask;
3385 #endif
3386
3387
3388 sb=sp;
3389 block=RTjpeg_old;
3390 /* Y */
3391 for(i=0; i<RTjpeg_height; i+=8)
3392 {
3393 for(j=0; j<RTjpeg_width; j+=8)
3394 {
3395 RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
3396 RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
3397 if(RTjpeg_bcomp(block, &RTjpeg_lmask))
3398 {
3399 *((__u8 *)sp++)=255;
3400 // printf("* %d ", sp[-1]);
3401 } else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
3402 block+=64;
3403 }
3404 bp+=RTjpeg_width<<3;
3405 }
3406 #ifdef MMX
3407 emms();
3408 #endif
3409 return (sp-sb);
3410 }
3411
/* Initialise colour-conversion state. Currently a no-op; kept so callers
   have a stable initialisation hook for the yuv->rgb routines below. */
void RTjpeg_color_init(void)
{
}
3415
/* 16.16 fixed-point YCbCr -> RGB conversion coefficients (multiply, then
   shift right by 16 in the converters below).
   NOTE(review): KcrR equals Ky (76284); common BT.601 full-range tables
   use a larger Cr->R weight (~104595) -- looks deliberate in this
   codebase, but worth confirming against the encoder's colour space. */
#define KcrR 76284
#define KcrG 53281
#define KcbG 25625
#define KcbB 132252
#define Ky 76284
3421
3422 void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb, int stride)
3423 {
3424 int tmp;
3425 int i, j;
3426 __s32 y, crR, crG, cbG, cbB;
3427 __u8 *bufcr, *bufcb, *bufy, *bufoute;
3428 int yskip;
3429
3430 yskip=RTjpeg_width;
3431
3432 bufcb=&buf[RTjpeg_width*RTjpeg_height];
3433 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
3434 bufy=&buf[0];
3435 bufoute=rgb;
3436
3437 for(i=0; i<(RTjpeg_height); i++)
3438 {
3439 for(j=0; j<RTjpeg_width; j+=2)
3440 {
3441 crR=(*bufcr-128)*KcrR;
3442 crG=(*(bufcr++)-128)*KcrG;
3443 cbG=(*bufcb-128)*KcbG;
3444 cbB=(*(bufcb++)-128)*KcbB;
3445
3446 y=(bufy[j]-16)*Ky;
3447
3448 tmp=(y+crR)>>16;
3449 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3450 tmp=(y-crG-cbG)>>16;
3451 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3452 tmp=(y+cbB)>>16;
3453 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3454
3455 y=(bufy[j+1]-16)*Ky;
3456
3457 tmp=(y+crR)>>16;
3458 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3459 tmp=(y-crG-cbG)>>16;
3460 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3461 tmp=(y+cbB)>>16;
3462 *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
3463
3464 }
3465 bufy+=yskip;
3466 }
3467 }
3468
3469
/*
 * Convert a planar YUV 4:2:0 frame (Y plane, quarter-size Cb, then
 * quarter-size Cr) to packed 3-byte pixels, writing two output rows per
 * pass so one chroma sample covers its 2x2 luma quad.
 * NOTE(review): this variant stores the (y+crR) component first while
 * RTjpeg_yuvrgb24/32 store (y+cbB) first -- the channel order differs
 * between converters; confirm intended (RGB vs BGR) with the callers.
 */
void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb, int stride)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;

 /* Skip applied to both row cursors after each row pair: the even/odd
    cursors stay one row apart, so skipping one extra row keeps them
    interleaved. stride, when given, is in bytes per output row. */
 if(stride==0)
  oskip=RTjpeg_width*3;
 else
  oskip=2*stride-RTjpeg_width*3;

 yskip=RTjpeg_width;

 /* plane layout: Y, then Cb, then Cr at quarter resolution */
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 bufy=&buf[0];
 bufoute=rgb;                  /* even output row cursor */
 bufouto=rgb+RTjpeg_width*3;   /* odd output row cursor */

 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   /* one chroma sample for the whole 2x2 quad; 16.16 fixed point */
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;

   /* top-left pixel */
   y=(bufy[j]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   /* top-right pixel */
   y=(bufy[j+1]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   /* bottom-left pixel (next luma row) */
   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);

   /* bottom-right pixel */
   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);

  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;   /* advance two luma rows */
 }
}
3542
3543
/*
 * Convert a planar YUV frame to packed 4-byte pixels (three colour bytes
 * plus one untouched pad byte), two output rows per pass. The (y+cbB)
 * component is stored first, then (y-crG-cbG), then (y+crR) -- i.e. the
 * reverse channel order of RTjpeg_yuv420rgb.
 * NOTE(review): the chroma offset here is (w*h)/2 (4:2:2-style plane
 * sizing) although chroma is consumed once per 2x2 quad like the /4
 * variants -- possible inconsistency with RTjpeg_yuvrgb24/16; verify.
 */
void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb, int stride)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;

 /* output skip after each pair of rows (stride is bytes per row) */
 if(stride==0)
  oskip=RTjpeg_width*4;
 else
  oskip = 2*stride-RTjpeg_width*4;
 yskip=RTjpeg_width;

 /* plane layout: Y, then Cb, then Cr */
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
 bufy=&buf[0];
 bufoute=rgb;                  /* even output row cursor */
 bufouto=rgb+RTjpeg_width*4;   /* odd output row cursor */

 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   /* one chroma sample per 2x2 quad; 16.16 fixed point */
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;

   /* top-left pixel */
   y=(bufy[j]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufoute++;   /* skip the pad byte, leaving it unwritten */

   /* top-right pixel */
   y=(bufy[j+1]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufoute++;

   /* bottom-left pixel (next luma row) */
   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufouto++;

   /* bottom-right pixel */
   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufouto++;

  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;   /* advance two luma rows */
 }
}
3619
/*
 * Convert a planar YUV 4:2:0 frame (Y, then quarter-size Cb, then
 * quarter-size Cr) to packed 3-byte pixels, two output rows per pass.
 * The (y+cbB) component is stored first, then (y-crG-cbG), then (y+crR)
 * -- the reverse channel order of RTjpeg_yuv420rgb.
 */
void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb, int stride)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;

 /* output skip after each pair of rows (stride is bytes per row) */
 if(stride==0)
  oskip=RTjpeg_width*3;
 else
  oskip=2*stride - RTjpeg_width*3;

 yskip=RTjpeg_width;

 /* plane layout: Y, then Cb, then Cr at quarter resolution */
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 bufy=&buf[0];
 bufoute=rgb;                  /* even output row cursor */
 bufouto=rgb+RTjpeg_width*3;   /* odd output row cursor */

 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   /* one chroma sample per 2x2 quad; 16.16 fixed point */
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;

   /* top-left pixel */
   y=(bufy[j]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   /* top-right pixel */
   y=(bufy[j+1]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   /* bottom-left pixel (next luma row) */
   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);

   /* bottom-right pixel */
   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);

  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;   /* advance two luma rows */
 }
}
3692
/*
 * Convert a planar YUV 4:2:0 frame (Y, then quarter-size Cb, then
 * quarter-size Cr) to packed 16-bit 5:6:5 pixels, two output rows per
 * pass. Each pixel is assembled as b>>3 | (g>>2)<<5 | (r>>3)<<11 and
 * stored low byte first, high byte second (little-endian RGB565).
 */
void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb, int stride)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;
 unsigned char r, g, b;

 /* output skip after each pair of rows (stride is bytes per row) */
 if(stride==0)
  oskip=RTjpeg_width*2;
 else
  oskip=2*stride-RTjpeg_width*2;

 yskip=RTjpeg_width;

 /* plane layout: Y, then Cb, then Cr at quarter resolution */
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 bufy=&buf[0];
 bufoute=rgb;                  /* even output row cursor */
 bufouto=rgb+RTjpeg_width*2;   /* odd output row cursor */

 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   /* one chroma sample per 2x2 quad; 16.16 fixed point */
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;

   /* top-left pixel: clamp each component to 0..255, then pack 5:6:5 */
   y=(bufy[j]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufoute++)=tmp&0xff;
   *(bufoute++)=tmp>>8;


   /* top-right pixel */
   y=(bufy[j+1]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufoute++)=tmp&0xff;
   *(bufoute++)=tmp>>8;

   /* bottom-left pixel (next luma row) */
   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufouto++)=tmp&0xff;
   *(bufouto++)=tmp>>8;

   /* bottom-right pixel */
   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufouto++)=tmp&0xff;
   *(bufouto++)=tmp>>8;

  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;   /* advance two luma rows */
 }
}
3787
/* TODO: honor the stride argument in RTjpeg_yuvrgb8 below (it is
   currently ignored and the output is assumed to be packed). */

3790 void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb, int stride)
3791 {
3792 bcopy(buf, rgb, RTjpeg_width*RTjpeg_height);
3793 }
3794