comparison i386/mpegvideo_mmx.c @ 706:e65798d228ea libavcodec

idct permutation cleanup; idct can be selected per context now, fixing some thread-unsafe code
author michaelni
date Sun, 29 Sep 2002 22:44:22 +0000
parents 9abb13c21fbe
children c3fc09466f92
comparison
equal deleted inserted replaced
705:107a56aa74f5 706:e65798d228ea
21 */ 21 */
22 22
23 #include "../dsputil.h" 23 #include "../dsputil.h"
24 #include "../mpegvideo.h" 24 #include "../mpegvideo.h"
25 #include "../avcodec.h" 25 #include "../avcodec.h"
26 26 #include "../simple_idct.h"
27 extern UINT8 zigzag_end[64]; 27
28 /* Input permutation for the simple_idct_mmx */
29 static UINT8 simple_mmx_permutation[64]={
30 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
31 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
32 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
33 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
34 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
35 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
36 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
37 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
38 };
28 39
29 extern UINT8 zigzag_direct_noperm[64]; 40 extern UINT8 zigzag_direct_noperm[64];
30 extern UINT16 inv_zigzag_direct16[64]; 41 extern UINT16 inv_zigzag_direct16[64];
31 extern UINT32 inverse[256]; 42 extern UINT32 inverse[256];
32 43
33 #if 0
34
35 /* XXX: GL: I don't understand why this function needs optimization
36 (it is called only once per frame!), so I disabled it */
37
38 void MPV_frame_start(MpegEncContext *s)
39 {
40 if (s->pict_type == B_TYPE) {
41 __asm __volatile(
42 "movl (%1), %%eax\n\t"
43 "movl 4(%1), %%edx\n\t"
44 "movl 8(%1), %%ecx\n\t"
45 "movl %%eax, (%0)\n\t"
46 "movl %%edx, 4(%0)\n\t"
47 "movl %%ecx, 8(%0)\n\t"
48 :
49 :"r"(s->current_picture), "r"(s->aux_picture)
50 :"eax","edx","ecx","memory");
51 } else {
52 /* swap next and last */
53 __asm __volatile(
54 "movl (%1), %%eax\n\t"
55 "movl 4(%1), %%edx\n\t"
56 "movl 8(%1), %%ecx\n\t"
57 "xchgl (%0), %%eax\n\t"
58 "xchgl 4(%0), %%edx\n\t"
59 "xchgl 8(%0), %%ecx\n\t"
60 "movl %%eax, (%1)\n\t"
61 "movl %%edx, 4(%1)\n\t"
62 "movl %%ecx, 8(%1)\n\t"
63 "movl %%eax, (%2)\n\t"
64 "movl %%edx, 4(%2)\n\t"
65 "movl %%ecx, 8(%2)\n\t"
66 :
67 :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture)
68 :"eax","edx","ecx","memory");
69 }
70 }
71 #endif
72
73 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; 44 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
74 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; 45 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
75 46
76 47
77 static void dct_unquantize_h263_mmx(MpegEncContext *s, 48 static void dct_unquantize_h263_mmx(MpegEncContext *s,
78 DCTELEM *block, int n, int qscale) 49 DCTELEM *block, int n, int qscale)
79 { 50 {
80 int i, level, qmul, qadd, nCoeffs; 51 int level, qmul, qadd, nCoeffs;
81 52
82 qmul = s->qscale << 1; 53 qmul = qscale << 1;
83 if (s->h263_aic && s->mb_intra) 54 qadd = (qscale - 1) | 1;
84 qadd = 0; 55
85 else 56 assert(s->block_last_index[n]>=0);
86 qadd = (s->qscale - 1) | 1; 57
87
88 if (s->mb_intra) { 58 if (s->mb_intra) {
89 if (!s->h263_aic) { 59 if (!s->h263_aic) {
90 if (n < 4) 60 if (n < 4)
91 block[0] = block[0] * s->y_dc_scale; 61 level = block[0] * s->y_dc_scale;
92 else 62 else
93 block[0] = block[0] * s->c_dc_scale; 63 level = block[0] * s->c_dc_scale;
64 }else{
65 qadd = 0;
66 level= block[0];
94 } 67 }
95 for(i=1; i<8; i++) { 68 nCoeffs=63;
96 level = block[i];
97 if (level) {
98 if (level < 0) {
99 level = level * qmul - qadd;
100 } else {
101 level = level * qmul + qadd;
102 }
103 block[i] = level;
104 }
105 }
106 nCoeffs=64;
107 } else { 69 } else {
108 i = 0; 70 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
109 nCoeffs= zigzag_end[ s->block_last_index[n] ];
110 } 71 }
111 //printf("%d %d ", qmul, qadd); 72 //printf("%d %d ", qmul, qadd);
112 asm volatile( 73 asm volatile(
113 "movd %1, %%mm6 \n\t" //qmul 74 "movd %1, %%mm6 \n\t" //qmul
114 "packssdw %%mm6, %%mm6 \n\t" 75 "packssdw %%mm6, %%mm6 \n\t"
150 111
151 "movq %%mm0, (%0, %3) \n\t" 112 "movq %%mm0, (%0, %3) \n\t"
152 "movq %%mm1, 8(%0, %3) \n\t" 113 "movq %%mm1, 8(%0, %3) \n\t"
153 114
154 "addl $16, %3 \n\t" 115 "addl $16, %3 \n\t"
155 "js 1b \n\t" 116 "jng 1b \n\t"
156 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(i-nCoeffs)) 117 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
157 : "memory" 118 : "memory"
158 ); 119 );
120 if(s->mb_intra)
121 block[0]= level;
159 } 122 }
160 123
161 124
162 /* 125 /*
163 NK: 126 NK:
191 static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, 154 static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
192 DCTELEM *block, int n, int qscale) 155 DCTELEM *block, int n, int qscale)
193 { 156 {
194 int nCoeffs; 157 int nCoeffs;
195 const UINT16 *quant_matrix; 158 const UINT16 *quant_matrix;
196 159
197 if(s->alternate_scan) nCoeffs= 64; 160 assert(s->block_last_index[n]>=0);
198 else nCoeffs= zigzag_end[ s->block_last_index[n] ]; 161
162 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
199 163
200 if (s->mb_intra) { 164 if (s->mb_intra) {
201 int block0; 165 int block0;
202 if (n < 4) 166 if (n < 4)
203 block0 = block[0] * s->y_dc_scale; 167 block0 = block[0] * s->y_dc_scale;
310 "js 1b \n\t" 274 "js 1b \n\t"
311 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 275 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
312 : "%eax", "memory" 276 : "%eax", "memory"
313 ); 277 );
314 } 278 }
279
315 } 280 }
316 281
317 static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, 282 static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
318 DCTELEM *block, int n, int qscale) 283 DCTELEM *block, int n, int qscale)
319 { 284 {
320 int nCoeffs; 285 int nCoeffs;
321 const UINT16 *quant_matrix; 286 const UINT16 *quant_matrix;
322 287
323 if(s->alternate_scan) nCoeffs= 64; 288 assert(s->block_last_index[n]>=0);
324 else nCoeffs= zigzag_end[ s->block_last_index[n] ]; 289
290 if(s->alternate_scan) nCoeffs= 63; //FIXME
291 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
325 292
326 if (s->mb_intra) { 293 if (s->mb_intra) {
327 int block0; 294 int block0;
328 if (n < 4) 295 if (n < 4)
329 block0 = block[0] * s->y_dc_scale; 296 block0 = block[0] * s->y_dc_scale;
369 "pandn %%mm1, %%mm5 \n\t" 336 "pandn %%mm1, %%mm5 \n\t"
370 "movq %%mm4, (%0, %%eax) \n\t" 337 "movq %%mm4, (%0, %%eax) \n\t"
371 "movq %%mm5, 8(%0, %%eax) \n\t" 338 "movq %%mm5, 8(%0, %%eax) \n\t"
372 339
373 "addl $16, %%eax \n\t" 340 "addl $16, %%eax \n\t"
374 "js 1b \n\t" 341 "jng 1b \n\t"
375 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 342 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
376 : "%eax", "memory" 343 : "%eax", "memory"
377 ); 344 );
378 block[0]= block0; 345 block[0]= block0;
379 //Note, we dont do mismatch control for intra as errors cannot accumulate 346 //Note, we dont do mismatch control for intra as errors cannot accumulate
425 "pxor %%mm5, %%mm7 \n\t" 392 "pxor %%mm5, %%mm7 \n\t"
426 "movq %%mm4, (%0, %%eax) \n\t" 393 "movq %%mm4, (%0, %%eax) \n\t"
427 "movq %%mm5, 8(%0, %%eax) \n\t" 394 "movq %%mm5, 8(%0, %%eax) \n\t"
428 395
429 "addl $16, %%eax \n\t" 396 "addl $16, %%eax \n\t"
430 "js 1b \n\t" 397 "jng 1b \n\t"
431 "movd 124(%0, %3), %%mm0 \n\t" 398 "movd 124(%0, %3), %%mm0 \n\t"
432 "movq %%mm7, %%mm6 \n\t" 399 "movq %%mm7, %%mm6 \n\t"
433 "psrlq $32, %%mm7 \n\t" 400 "psrlq $32, %%mm7 \n\t"
434 "pxor %%mm6, %%mm7 \n\t" 401 "pxor %%mm6, %%mm7 \n\t"
435 "movq %%mm7, %%mm6 \n\t" 402 "movq %%mm7, %%mm6 \n\t"
532 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) 499 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w)
533 ); 500 );
534 } 501 }
535 } 502 }
536 503
537 static volatile int esp_temp;
538
539 void unused_var_warning_killer(){
540 esp_temp++;
541 }
542
543 #undef HAVE_MMX2 504 #undef HAVE_MMX2
544 #define RENAME(a) a ## _MMX 505 #define RENAME(a) a ## _MMX
545 #include "mpegvideo_mmx_template.c" 506 #include "mpegvideo_mmx_template.c"
546 507
547 #define HAVE_MMX2 508 #define HAVE_MMX2
548 #undef RENAME 509 #undef RENAME
549 #define RENAME(a) a ## _MMX2 510 #define RENAME(a) a ## _MMX2
550 #include "mpegvideo_mmx_template.c" 511 #include "mpegvideo_mmx_template.c"
551 512
513 /* external functions, from idct_mmx.c */
514 void ff_mmx_idct(DCTELEM *block);
515 void ff_mmxext_idct(DCTELEM *block);
516
517 /* XXX: those functions should be suppressed ASAP when all IDCTs are
518 converted */
519 static void ff_libmpeg2mmx_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
520 {
521 ff_mmx_idct (block);
522 put_pixels_clamped(block, dest, line_size);
523 }
524 static void ff_libmpeg2mmx_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
525 {
526 ff_mmx_idct (block);
527 add_pixels_clamped(block, dest, line_size);
528 }
529 static void ff_libmpeg2mmx2_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
530 {
531 ff_mmxext_idct (block);
532 put_pixels_clamped(block, dest, line_size);
533 }
534 static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
535 {
536 ff_mmxext_idct (block);
537 add_pixels_clamped(block, dest, line_size);
538 }
539
552 void MPV_common_init_mmx(MpegEncContext *s) 540 void MPV_common_init_mmx(MpegEncContext *s)
553 { 541 {
554 if (mm_flags & MM_MMX) { 542 if (mm_flags & MM_MMX) {
555 const int dct_algo= s->avctx->dct_algo; 543 int i;
544 const int dct_algo = s->avctx->dct_algo;
545 const int idct_algo= s->avctx->idct_algo;
546
556 s->dct_unquantize_h263 = dct_unquantize_h263_mmx; 547 s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
557 s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; 548 s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
558 s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; 549 s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
559 550
560 draw_edges = draw_edges_mmx; 551 draw_edges = draw_edges_mmx;
566 s->dct_quantize= dct_quantize_MMX2; 557 s->dct_quantize= dct_quantize_MMX2;
567 } else { 558 } else {
568 s->dct_quantize= dct_quantize_MMX; 559 s->dct_quantize= dct_quantize_MMX;
569 } 560 }
570 } 561 }
571 } 562
572 } 563 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
564 s->idct_put= ff_simple_idct_put_mmx;
565 s->idct_add= ff_simple_idct_add_mmx;
566 for(i=0; i<64; i++)
567 s->idct_permutation[i]= simple_mmx_permutation[i];
568 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
569 if(mm_flags & MM_MMXEXT){
570 s->idct_put= ff_libmpeg2mmx2_idct_put;
571 s->idct_add= ff_libmpeg2mmx2_idct_add;
572 }else{
573 s->idct_put= ff_libmpeg2mmx_idct_put;
574 s->idct_add= ff_libmpeg2mmx_idct_add;
575 }
576 for(i=0; i<64; i++)
577 s->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
578 }
579 }
580 }