Mercurial > libavcodec.hg
comparison i386/mpegvideo_mmx.c @ 706:e65798d228ea libavcodec
IDCT permutation cleanup; the IDCT can now be selected per context
fixing some thread-unsafe code
author | michaelni |
---|---|
date | Sun, 29 Sep 2002 22:44:22 +0000 |
parents | 9abb13c21fbe |
children | c3fc09466f92 |
comparison
equal
deleted
inserted
replaced
705:107a56aa74f5 | 706:e65798d228ea |
---|---|
21 */ | 21 */ |
22 | 22 |
23 #include "../dsputil.h" | 23 #include "../dsputil.h" |
24 #include "../mpegvideo.h" | 24 #include "../mpegvideo.h" |
25 #include "../avcodec.h" | 25 #include "../avcodec.h" |
26 | 26 #include "../simple_idct.h" |
27 extern UINT8 zigzag_end[64]; | 27 |
28 /* Input permutation for the simple_idct_mmx */ | |
29 static UINT8 simple_mmx_permutation[64]={ | |
30 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, | |
31 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, | |
32 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, | |
33 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, | |
34 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, | |
35 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, | |
36 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, | |
37 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, | |
38 }; | |
28 | 39 |
29 extern UINT8 zigzag_direct_noperm[64]; | 40 extern UINT8 zigzag_direct_noperm[64]; |
30 extern UINT16 inv_zigzag_direct16[64]; | 41 extern UINT16 inv_zigzag_direct16[64]; |
31 extern UINT32 inverse[256]; | 42 extern UINT32 inverse[256]; |
32 | 43 |
33 #if 0 | |
34 | |
35 /* XXX: GL: I don't understand why this function needs optimization | |
36 (it is called only once per frame!), so I disabled it */ | |
37 | |
38 void MPV_frame_start(MpegEncContext *s) | |
39 { | |
40 if (s->pict_type == B_TYPE) { | |
41 __asm __volatile( | |
42 "movl (%1), %%eax\n\t" | |
43 "movl 4(%1), %%edx\n\t" | |
44 "movl 8(%1), %%ecx\n\t" | |
45 "movl %%eax, (%0)\n\t" | |
46 "movl %%edx, 4(%0)\n\t" | |
47 "movl %%ecx, 8(%0)\n\t" | |
48 : | |
49 :"r"(s->current_picture), "r"(s->aux_picture) | |
50 :"eax","edx","ecx","memory"); | |
51 } else { | |
52 /* swap next and last */ | |
53 __asm __volatile( | |
54 "movl (%1), %%eax\n\t" | |
55 "movl 4(%1), %%edx\n\t" | |
56 "movl 8(%1), %%ecx\n\t" | |
57 "xchgl (%0), %%eax\n\t" | |
58 "xchgl 4(%0), %%edx\n\t" | |
59 "xchgl 8(%0), %%ecx\n\t" | |
60 "movl %%eax, (%1)\n\t" | |
61 "movl %%edx, 4(%1)\n\t" | |
62 "movl %%ecx, 8(%1)\n\t" | |
63 "movl %%eax, (%2)\n\t" | |
64 "movl %%edx, 4(%2)\n\t" | |
65 "movl %%ecx, 8(%2)\n\t" | |
66 : | |
67 :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) | |
68 :"eax","edx","ecx","memory"); | |
69 } | |
70 } | |
71 #endif | |
72 | |
73 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; | 44 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; |
74 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; | 45 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; |
75 | 46 |
76 | 47 |
77 static void dct_unquantize_h263_mmx(MpegEncContext *s, | 48 static void dct_unquantize_h263_mmx(MpegEncContext *s, |
78 DCTELEM *block, int n, int qscale) | 49 DCTELEM *block, int n, int qscale) |
79 { | 50 { |
80 int i, level, qmul, qadd, nCoeffs; | 51 int level, qmul, qadd, nCoeffs; |
81 | 52 |
82 qmul = s->qscale << 1; | 53 qmul = qscale << 1; |
83 if (s->h263_aic && s->mb_intra) | 54 qadd = (qscale - 1) | 1; |
84 qadd = 0; | 55 |
85 else | 56 assert(s->block_last_index[n]>=0); |
86 qadd = (s->qscale - 1) | 1; | 57 |
87 | |
88 if (s->mb_intra) { | 58 if (s->mb_intra) { |
89 if (!s->h263_aic) { | 59 if (!s->h263_aic) { |
90 if (n < 4) | 60 if (n < 4) |
91 block[0] = block[0] * s->y_dc_scale; | 61 level = block[0] * s->y_dc_scale; |
92 else | 62 else |
93 block[0] = block[0] * s->c_dc_scale; | 63 level = block[0] * s->c_dc_scale; |
64 }else{ | |
65 qadd = 0; | |
66 level= block[0]; | |
94 } | 67 } |
95 for(i=1; i<8; i++) { | 68 nCoeffs=63; |
96 level = block[i]; | |
97 if (level) { | |
98 if (level < 0) { | |
99 level = level * qmul - qadd; | |
100 } else { | |
101 level = level * qmul + qadd; | |
102 } | |
103 block[i] = level; | |
104 } | |
105 } | |
106 nCoeffs=64; | |
107 } else { | 69 } else { |
108 i = 0; | 70 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; |
109 nCoeffs= zigzag_end[ s->block_last_index[n] ]; | |
110 } | 71 } |
111 //printf("%d %d ", qmul, qadd); | 72 //printf("%d %d ", qmul, qadd); |
112 asm volatile( | 73 asm volatile( |
113 "movd %1, %%mm6 \n\t" //qmul | 74 "movd %1, %%mm6 \n\t" //qmul |
114 "packssdw %%mm6, %%mm6 \n\t" | 75 "packssdw %%mm6, %%mm6 \n\t" |
150 | 111 |
151 "movq %%mm0, (%0, %3) \n\t" | 112 "movq %%mm0, (%0, %3) \n\t" |
152 "movq %%mm1, 8(%0, %3) \n\t" | 113 "movq %%mm1, 8(%0, %3) \n\t" |
153 | 114 |
154 "addl $16, %3 \n\t" | 115 "addl $16, %3 \n\t" |
155 "js 1b \n\t" | 116 "jng 1b \n\t" |
156 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(i-nCoeffs)) | 117 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) |
157 : "memory" | 118 : "memory" |
158 ); | 119 ); |
120 if(s->mb_intra) | |
121 block[0]= level; | |
159 } | 122 } |
160 | 123 |
161 | 124 |
162 /* | 125 /* |
163 NK: | 126 NK: |
191 static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, | 154 static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, |
192 DCTELEM *block, int n, int qscale) | 155 DCTELEM *block, int n, int qscale) |
193 { | 156 { |
194 int nCoeffs; | 157 int nCoeffs; |
195 const UINT16 *quant_matrix; | 158 const UINT16 *quant_matrix; |
196 | 159 |
197 if(s->alternate_scan) nCoeffs= 64; | 160 assert(s->block_last_index[n]>=0); |
198 else nCoeffs= zigzag_end[ s->block_last_index[n] ]; | 161 |
162 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
199 | 163 |
200 if (s->mb_intra) { | 164 if (s->mb_intra) { |
201 int block0; | 165 int block0; |
202 if (n < 4) | 166 if (n < 4) |
203 block0 = block[0] * s->y_dc_scale; | 167 block0 = block[0] * s->y_dc_scale; |
310 "js 1b \n\t" | 274 "js 1b \n\t" |
311 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | 275 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) |
312 : "%eax", "memory" | 276 : "%eax", "memory" |
313 ); | 277 ); |
314 } | 278 } |
279 | |
315 } | 280 } |
316 | 281 |
317 static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, | 282 static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, |
318 DCTELEM *block, int n, int qscale) | 283 DCTELEM *block, int n, int qscale) |
319 { | 284 { |
320 int nCoeffs; | 285 int nCoeffs; |
321 const UINT16 *quant_matrix; | 286 const UINT16 *quant_matrix; |
322 | 287 |
323 if(s->alternate_scan) nCoeffs= 64; | 288 assert(s->block_last_index[n]>=0); |
324 else nCoeffs= zigzag_end[ s->block_last_index[n] ]; | 289 |
290 if(s->alternate_scan) nCoeffs= 63; //FIXME | |
291 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
325 | 292 |
326 if (s->mb_intra) { | 293 if (s->mb_intra) { |
327 int block0; | 294 int block0; |
328 if (n < 4) | 295 if (n < 4) |
329 block0 = block[0] * s->y_dc_scale; | 296 block0 = block[0] * s->y_dc_scale; |
369 "pandn %%mm1, %%mm5 \n\t" | 336 "pandn %%mm1, %%mm5 \n\t" |
370 "movq %%mm4, (%0, %%eax) \n\t" | 337 "movq %%mm4, (%0, %%eax) \n\t" |
371 "movq %%mm5, 8(%0, %%eax) \n\t" | 338 "movq %%mm5, 8(%0, %%eax) \n\t" |
372 | 339 |
373 "addl $16, %%eax \n\t" | 340 "addl $16, %%eax \n\t" |
374 "js 1b \n\t" | 341 "jng 1b \n\t" |
375 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) | 342 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) |
376 : "%eax", "memory" | 343 : "%eax", "memory" |
377 ); | 344 ); |
378 block[0]= block0; | 345 block[0]= block0; |
379 //Note, we don't do mismatch control for intra as errors cannot accumulate | 346 //Note, we don't do mismatch control for intra as errors cannot accumulate |
425 "pxor %%mm5, %%mm7 \n\t" | 392 "pxor %%mm5, %%mm7 \n\t" |
426 "movq %%mm4, (%0, %%eax) \n\t" | 393 "movq %%mm4, (%0, %%eax) \n\t" |
427 "movq %%mm5, 8(%0, %%eax) \n\t" | 394 "movq %%mm5, 8(%0, %%eax) \n\t" |
428 | 395 |
429 "addl $16, %%eax \n\t" | 396 "addl $16, %%eax \n\t" |
430 "js 1b \n\t" | 397 "jng 1b \n\t" |
431 "movd 124(%0, %3), %%mm0 \n\t" | 398 "movd 124(%0, %3), %%mm0 \n\t" |
432 "movq %%mm7, %%mm6 \n\t" | 399 "movq %%mm7, %%mm6 \n\t" |
433 "psrlq $32, %%mm7 \n\t" | 400 "psrlq $32, %%mm7 \n\t" |
434 "pxor %%mm6, %%mm7 \n\t" | 401 "pxor %%mm6, %%mm7 \n\t" |
435 "movq %%mm7, %%mm6 \n\t" | 402 "movq %%mm7, %%mm6 \n\t" |
532 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) | 499 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) |
533 ); | 500 ); |
534 } | 501 } |
535 } | 502 } |
536 | 503 |
537 static volatile int esp_temp; | |
538 | |
539 void unused_var_warning_killer(){ | |
540 esp_temp++; | |
541 } | |
542 | |
543 #undef HAVE_MMX2 | 504 #undef HAVE_MMX2 |
544 #define RENAME(a) a ## _MMX | 505 #define RENAME(a) a ## _MMX |
545 #include "mpegvideo_mmx_template.c" | 506 #include "mpegvideo_mmx_template.c" |
546 | 507 |
547 #define HAVE_MMX2 | 508 #define HAVE_MMX2 |
548 #undef RENAME | 509 #undef RENAME |
549 #define RENAME(a) a ## _MMX2 | 510 #define RENAME(a) a ## _MMX2 |
550 #include "mpegvideo_mmx_template.c" | 511 #include "mpegvideo_mmx_template.c" |
551 | 512 |
513 /* external functions, from idct_mmx.c */ | |
514 void ff_mmx_idct(DCTELEM *block); | |
515 void ff_mmxext_idct(DCTELEM *block); | |
516 | |
517 /* XXX: these wrapper functions should be removed as soon as all IDCTs are | |
518 converted */ | |
519 static void ff_libmpeg2mmx_idct_put(UINT8 *dest, int line_size, DCTELEM *block) | |
520 { | |
521 ff_mmx_idct (block); | |
522 put_pixels_clamped(block, dest, line_size); | |
523 } | |
524 static void ff_libmpeg2mmx_idct_add(UINT8 *dest, int line_size, DCTELEM *block) | |
525 { | |
526 ff_mmx_idct (block); | |
527 add_pixels_clamped(block, dest, line_size); | |
528 } | |
529 static void ff_libmpeg2mmx2_idct_put(UINT8 *dest, int line_size, DCTELEM *block) | |
530 { | |
531 ff_mmxext_idct (block); | |
532 put_pixels_clamped(block, dest, line_size); | |
533 } | |
534 static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block) | |
535 { | |
536 ff_mmxext_idct (block); | |
537 add_pixels_clamped(block, dest, line_size); | |
538 } | |
539 | |
552 void MPV_common_init_mmx(MpegEncContext *s) | 540 void MPV_common_init_mmx(MpegEncContext *s) |
553 { | 541 { |
554 if (mm_flags & MM_MMX) { | 542 if (mm_flags & MM_MMX) { |
555 const int dct_algo= s->avctx->dct_algo; | 543 int i; |
544 const int dct_algo = s->avctx->dct_algo; | |
545 const int idct_algo= s->avctx->idct_algo; | |
546 | |
556 s->dct_unquantize_h263 = dct_unquantize_h263_mmx; | 547 s->dct_unquantize_h263 = dct_unquantize_h263_mmx; |
557 s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; | 548 s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; |
558 s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; | 549 s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; |
559 | 550 |
560 draw_edges = draw_edges_mmx; | 551 draw_edges = draw_edges_mmx; |
566 s->dct_quantize= dct_quantize_MMX2; | 557 s->dct_quantize= dct_quantize_MMX2; |
567 } else { | 558 } else { |
568 s->dct_quantize= dct_quantize_MMX; | 559 s->dct_quantize= dct_quantize_MMX; |
569 } | 560 } |
570 } | 561 } |
571 } | 562 |
572 } | 563 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ |
564 s->idct_put= ff_simple_idct_put_mmx; | |
565 s->idct_add= ff_simple_idct_add_mmx; | |
566 for(i=0; i<64; i++) | |
567 s->idct_permutation[i]= simple_mmx_permutation[i]; | |
568 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ | |
569 if(mm_flags & MM_MMXEXT){ | |
570 s->idct_put= ff_libmpeg2mmx2_idct_put; | |
571 s->idct_add= ff_libmpeg2mmx2_idct_add; | |
572 }else{ | |
573 s->idct_put= ff_libmpeg2mmx_idct_put; | |
574 s->idct_add= ff_libmpeg2mmx_idct_add; | |
575 } | |
576 for(i=0; i<64; i++) | |
577 s->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); | |
578 } | |
579 } | |
580 } |