Mercurial > libavcodec.hg
comparison i386/fdct_mmx.c @ 4961:3e8e345084cb libavcodec
Add an SSE2 version of fdct_col.
Timing: K8 72 -> 61 cycles, Core 2 51 -> 26 cycles.
author | lorenm |
---|---|
date | Thu, 10 May 2007 03:13:41 +0000 |
parents | d6f83e2f8804 |
children | d5ba514e3f4a |
comparison
equal
deleted
inserted
replaced
4960:1745d0452e87 | 4961:3e8e345084cb |
---|---|
50 #define SHIFT_FRW_COL BITS_FRW_ACC | 50 #define SHIFT_FRW_COL BITS_FRW_ACC |
51 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) | 51 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) |
52 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) | 52 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) |
53 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) | 53 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) |
54 | 54 |
| | 55 #define X8(x) x,x,x,x,x,x,x,x |
| | 56 |
55 //concatenated table, for forward DCT transformation | 57 //concatenated table, for forward DCT transformation |
56 static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { | 58 static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = { |
57 13036, 13036, 13036, 13036, // tg * (2<<16) + 0.5 | 59 X8(13036), // tg * (2<<16) + 0.5 |
58 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 | 60 X8(27146), // tg * (2<<16) + 0.5 |
59 -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5 | 61 X8(-21746) // tg * (2<<16) + 0.5 |
60 }; | 62 }; |
61 | 63 |
62 static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = { | 64 static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = { |
63 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 | 65 X8(23170) //cos * (2<<15) + 0.5 |
64 }; | 66 }; |
65 | 67 |
66 static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; | 68 static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) }; |
67 | 69 |
68 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; | 70 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; |
69 | 71 |
70 static struct | 72 static struct |
71 { | 73 { |
281 #define C6 12299 | 283 #define C6 12299 |
282 #define C7 6270 | 284 #define C7 6270 |
283 TABLE_SSE2 | 285 TABLE_SSE2 |
284 }}; | 286 }}; |
285 | 287 |
286 | 288 #define FDCT_COL(cpu, mm, mov)\ |
287 static av_always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) | 289 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ |
288 { | 290 {\ |
289 movq_m2r(*(in + offset + 1 * 8), mm0); | 291 mov##_m2r(*(in + offset + 1 * 8), mm##0);\ |
290 movq_m2r(*(in + offset + 6 * 8), mm1); | 292 mov##_m2r(*(in + offset + 6 * 8), mm##1);\ |
291 movq_r2r(mm0, mm2); | 293 mov##_r2r(mm##0, mm##2);\ |
292 movq_m2r(*(in + offset + 2 * 8), mm3); | 294 mov##_m2r(*(in + offset + 2 * 8), mm##3);\ |
293 paddsw_r2r(mm1, mm0); | 295 paddsw_r2r(mm##1, mm##0);\ |
294 movq_m2r(*(in + offset + 5 * 8), mm4); | 296 mov##_m2r(*(in + offset + 5 * 8), mm##4);\ |
295 psllw_i2r(SHIFT_FRW_COL, mm0); | 297 psllw_i2r(SHIFT_FRW_COL, mm##0);\ |
296 movq_m2r(*(in + offset + 0 * 8), mm5); | 298 mov##_m2r(*(in + offset + 0 * 8), mm##5);\ |
297 paddsw_r2r(mm3, mm4); | 299 paddsw_r2r(mm##3, mm##4);\ |
298 paddsw_m2r(*(in + offset + 7 * 8), mm5); | 300 paddsw_m2r(*(in + offset + 7 * 8), mm##5);\ |
299 psllw_i2r(SHIFT_FRW_COL, mm4); | 301 psllw_i2r(SHIFT_FRW_COL, mm##4);\ |
300 movq_r2r(mm0, mm6); | 302 mov##_r2r(mm##0, mm##6);\ |
301 psubsw_r2r(mm1, mm2); | 303 psubsw_r2r(mm##1, mm##2);\ |
302 movq_m2r(*(fdct_tg_all_16 + 4), mm1); | 304 mov##_m2r(*(fdct_tg_all_16 + 8), mm##1);\ |
303 psubsw_r2r(mm4, mm0); | 305 psubsw_r2r(mm##4, mm##0);\ |
304 movq_m2r(*(in + offset + 3 * 8), mm7); | 306 mov##_m2r(*(in + offset + 3 * 8), mm##7);\ |
305 pmulhw_r2r(mm0, mm1); | 307 pmulhw_r2r(mm##0, mm##1);\ |
306 paddsw_m2r(*(in + offset + 4 * 8), mm7); | 308 paddsw_m2r(*(in + offset + 4 * 8), mm##7);\ |
307 psllw_i2r(SHIFT_FRW_COL, mm5); | 309 psllw_i2r(SHIFT_FRW_COL, mm##5);\ |
308 paddsw_r2r(mm4, mm6); | 310 paddsw_r2r(mm##4, mm##6);\ |
309 psllw_i2r(SHIFT_FRW_COL, mm7); | 311 psllw_i2r(SHIFT_FRW_COL, mm##7);\ |
310 movq_r2r(mm5, mm4); | 312 mov##_r2r(mm##5, mm##4);\ |
311 psubsw_r2r(mm7, mm5); | 313 psubsw_r2r(mm##7, mm##5);\ |
312 paddsw_r2r(mm5, mm1); | 314 paddsw_r2r(mm##5, mm##1);\ |
313 paddsw_r2r(mm7, mm4); | 315 paddsw_r2r(mm##7, mm##4);\ |
314 por_m2r(fdct_one_corr, mm1); | 316 por_m2r(*fdct_one_corr, mm##1);\ |
315 psllw_i2r(SHIFT_FRW_COL + 1, mm2); | 317 psllw_i2r(SHIFT_FRW_COL + 1, mm##2);\ |
316 pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5); | 318 pmulhw_m2r(*(fdct_tg_all_16 + 8), mm##5);\ |
317 movq_r2r(mm4, mm7); | 319 mov##_r2r(mm##4, mm##7);\ |
318 psubsw_m2r(*(in + offset + 5 * 8), mm3); | 320 psubsw_m2r(*(in + offset + 5 * 8), mm##3);\ |
319 psubsw_r2r(mm6, mm4); | 321 psubsw_r2r(mm##6, mm##4);\ |
320 movq_r2m(mm1, *(out + offset + 2 * 8)); | 322 mov##_r2m(mm##1, *(out + offset + 2 * 8));\ |
321 paddsw_r2r(mm6, mm7); | 323 paddsw_r2r(mm##6, mm##7);\ |
322 movq_m2r(*(in + offset + 3 * 8), mm1); | 324 mov##_m2r(*(in + offset + 3 * 8), mm##1);\ |
323 psllw_i2r(SHIFT_FRW_COL + 1, mm3); | 325 psllw_i2r(SHIFT_FRW_COL + 1, mm##3);\ |
324 psubsw_m2r(*(in + offset + 4 * 8), mm1); | 326 psubsw_m2r(*(in + offset + 4 * 8), mm##1);\ |
325 movq_r2r(mm2, mm6); | 327 mov##_r2r(mm##2, mm##6);\ |
326 movq_r2m(mm4, *(out + offset + 4 * 8)); | 328 mov##_r2m(mm##4, *(out + offset + 4 * 8));\ |
327 paddsw_r2r(mm3, mm2); | 329 paddsw_r2r(mm##3, mm##2);\ |
328 pmulhw_m2r(*ocos_4_16, mm2); | 330 pmulhw_m2r(*ocos_4_16, mm##2);\ |
329 psubsw_r2r(mm3, mm6); | 331 psubsw_r2r(mm##3, mm##6);\ |
330 pmulhw_m2r(*ocos_4_16, mm6); | 332 pmulhw_m2r(*ocos_4_16, mm##6);\ |
331 psubsw_r2r(mm0, mm5); | 333 psubsw_r2r(mm##0, mm##5);\ |
332 por_m2r(fdct_one_corr, mm5); | 334 por_m2r(*fdct_one_corr, mm##5);\ |
333 psllw_i2r(SHIFT_FRW_COL, mm1); | 335 psllw_i2r(SHIFT_FRW_COL, mm##1);\ |
334 por_m2r(fdct_one_corr, mm2); | 336 por_m2r(*fdct_one_corr, mm##2);\ |
335 movq_r2r(mm1, mm4); | 337 mov##_r2r(mm##1, mm##4);\ |
336 movq_m2r(*(in + offset + 0 * 8), mm3); | 338 mov##_m2r(*(in + offset + 0 * 8), mm##3);\ |
337 paddsw_r2r(mm6, mm1); | 339 paddsw_r2r(mm##6, mm##1);\ |
338 psubsw_m2r(*(in + offset + 7 * 8), mm3); | 340 psubsw_m2r(*(in + offset + 7 * 8), mm##3);\ |
339 psubsw_r2r(mm6, mm4); | 341 psubsw_r2r(mm##6, mm##4);\ |
340 movq_m2r(*(fdct_tg_all_16 + 0), mm0); | 342 mov##_m2r(*(fdct_tg_all_16 + 0), mm##0);\ |
341 psllw_i2r(SHIFT_FRW_COL, mm3); | 343 psllw_i2r(SHIFT_FRW_COL, mm##3);\ |
342 movq_m2r(*(fdct_tg_all_16 + 8), mm6); | 344 mov##_m2r(*(fdct_tg_all_16 + 16), mm##6);\ |
343 pmulhw_r2r(mm1, mm0); | 345 pmulhw_r2r(mm##1, mm##0);\ |
344 movq_r2m(mm7, *(out + offset + 0 * 8)); | 346 mov##_r2m(mm##7, *(out + offset + 0 * 8));\ |
345 pmulhw_r2r(mm4, mm6); | 347 pmulhw_r2r(mm##4, mm##6);\ |
346 movq_r2m(mm5, *(out + offset + 6 * 8)); | 348 mov##_r2m(mm##5, *(out + offset + 6 * 8));\ |
347 movq_r2r(mm3, mm7); | 349 mov##_r2r(mm##3, mm##7);\ |
348 movq_m2r(*(fdct_tg_all_16 + 8), mm5); | 350 mov##_m2r(*(fdct_tg_all_16 + 16), mm##5);\ |
349 psubsw_r2r(mm2, mm7); | 351 psubsw_r2r(mm##2, mm##7);\ |
350 paddsw_r2r(mm2, mm3); | 352 paddsw_r2r(mm##2, mm##3);\ |
351 pmulhw_r2r(mm7, mm5); | 353 pmulhw_r2r(mm##7, mm##5);\ |
352 paddsw_r2r(mm3, mm0); | 354 paddsw_r2r(mm##3, mm##0);\ |
353 paddsw_r2r(mm4, mm6); | 355 paddsw_r2r(mm##4, mm##6);\ |
354 pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3); | 356 pmulhw_m2r(*(fdct_tg_all_16 + 0), mm##3);\ |
355 por_m2r(fdct_one_corr, mm0); | 357 por_m2r(*fdct_one_corr, mm##0);\ |
356 paddsw_r2r(mm7, mm5); | 358 paddsw_r2r(mm##7, mm##5);\ |
357 psubsw_r2r(mm6, mm7); | 359 psubsw_r2r(mm##6, mm##7);\ |
358 movq_r2m(mm0, *(out + offset + 1 * 8)); | 360 mov##_r2m(mm##0, *(out + offset + 1 * 8));\ |
359 paddsw_r2r(mm4, mm5); | 361 paddsw_r2r(mm##4, mm##5);\ |
360 movq_r2m(mm7, *(out + offset + 3 * 8)); | 362 mov##_r2m(mm##7, *(out + offset + 3 * 8));\ |
361 psubsw_r2r(mm1, mm3); | 363 psubsw_r2r(mm##1, mm##3);\ |
362 movq_r2m(mm5, *(out + offset + 5 * 8)); | 364 mov##_r2m(mm##5, *(out + offset + 5 * 8));\ |
363 movq_r2m(mm3, *(out + offset + 7 * 8)); | 365 mov##_r2m(mm##3, *(out + offset + 7 * 8));\ |
364 } | 366 } |
365 | 367 |
| | 368 FDCT_COL(mmx, mm, movq) |
| | 369 FDCT_COL(sse2, xmm, movdqa) |
366 | 370 |
367 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) | 371 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) |
368 { | 372 { |
369 asm volatile( | 373 asm volatile( |
370 #define FDCT_ROW_SSE2_H1(i,t) \ | 374 #define FDCT_ROW_SSE2_H1(i,t) \ |
522 int64_t align_tmp[16] ATTR_ALIGN(8); | 526 int64_t align_tmp[16] ATTR_ALIGN(8); |
523 int16_t * block1= (int16_t*)align_tmp; | 527 int16_t * block1= (int16_t*)align_tmp; |
524 const int16_t *table= tab_frw_01234567; | 528 const int16_t *table= tab_frw_01234567; |
525 int i; | 529 int i; |
526 | 530 |
527 fdct_col(block, block1, 0); | 531 fdct_col_mmx(block, block1, 0); |
528 fdct_col(block, block1, 4); | 532 fdct_col_mmx(block, block1, 4); |
529 | 533 |
530 for(i=8;i>0;i--) { | 534 for(i=8;i>0;i--) { |
531 fdct_row_mmx(block1, block, table); | 535 fdct_row_mmx(block1, block, table); |
532 block1 += 8; | 536 block1 += 8; |
533 table += 32; | 537 table += 32; |
540 int64_t align_tmp[16] ATTR_ALIGN(8); | 544 int64_t align_tmp[16] ATTR_ALIGN(8); |
541 int16_t *block1= (int16_t*)align_tmp; | 545 int16_t *block1= (int16_t*)align_tmp; |
542 const int16_t *table= tab_frw_01234567; | 546 const int16_t *table= tab_frw_01234567; |
543 int i; | 547 int i; |
544 | 548 |
545 fdct_col(block, block1, 0); | 549 fdct_col_mmx(block, block1, 0); |
546 fdct_col(block, block1, 4); | 550 fdct_col_mmx(block, block1, 4); |
547 | 551 |
548 for(i=8;i>0;i--) { | 552 for(i=8;i>0;i--) { |
549 fdct_row_mmx2(block1, block, table); | 553 fdct_row_mmx2(block1, block, table); |
550 block1 += 8; | 554 block1 += 8; |
551 table += 32; | 555 table += 32; |
556 void ff_fdct_sse2(int16_t *block) | 560 void ff_fdct_sse2(int16_t *block) |
557 { | 561 { |
558 int64_t align_tmp[16] ATTR_ALIGN(16); | 562 int64_t align_tmp[16] ATTR_ALIGN(16); |
559 int16_t * const block1= (int16_t*)align_tmp; | 563 int16_t * const block1= (int16_t*)align_tmp; |
560 | 564 |
561 fdct_col(block, block1, 0); | 565 fdct_col_sse2(block, block1, 0); |
562 fdct_col(block, block1, 4); | |
563 | |
564 fdct_row_sse2(block1, block); | 566 fdct_row_sse2(block1, block); |
565 } | 567 } |
566 | 568 |