Mercurial > mplayer.hg
comparison liba52/imdct.c @ 3537:d7e5a32643c9
C optimizations
sse opt.
author | michael |
---|---|
date | Sun, 16 Dec 2001 23:00:03 +0000 |
parents | 3483390a902b |
children | 4e772a3c6b62 |
comparison
legend: equal | deleted | inserted | replaced
3536:633a8d6e40dc | 3537:d7e5a32643c9 |
---|---|
330 Note sseW2+16={0,0,sqrt(2),-sqrt(2)} | 330 Note sseW2+16={0,0,sqrt(2),-sqrt(2)} |
331 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)} | 331 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)} |
332 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)} | 332 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)} |
333 */ | 333 */ |
334 asm volatile( | 334 asm volatile( |
335 "movaps sseW2, %%xmm6 \n\t" | 335 "movaps 48+sseW2, %%xmm6 \n\t" |
336 "movaps 16+sseW2, %%xmm7 \n\t" | 336 "movaps 16+sseW2, %%xmm7 \n\t" |
337 "xorps %%xmm5, %%xmm5 \n\t" | 337 "xorps %%xmm5, %%xmm5 \n\t" |
338 "xorps %%xmm2, %%xmm2 \n\t" | 338 "xorps %%xmm2, %%xmm2 \n\t" |
339 "movl %0, %%esi \n\t" | 339 "movl %0, %%esi \n\t" |
340 ".balign 16 \n\t" | 340 ".balign 16 \n\t" |
341 "1: \n\t" | 341 "1: \n\t" |
342 "movhps 40(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 | 342 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 |
343 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 | 343 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 |
344 "movaps 32(%%esi), %%xmm4 \n\t" //r4,i4,r5,i5 | 344 "movaps sseW2, %%xmm4 \n\t" //r4,i4,r5,i5 |
345 "movhps 56(%%esi), %%xmm5 \n\t" //r6,i6,r7,i7 | 345 "movaps 32+sseW2, %%xmm5 \n\t" //r6,i6,r7,i7 |
346 "mulps %%xmm2, %%xmm4 \n\t" | |
347 "mulps %%xmm3, %%xmm5 \n\t" | |
346 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 | 348 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 |
347 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 | 349 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 |
348 "mulps %%xmm6, %%xmm4 \n\t" | 350 "mulps %%xmm6, %%xmm3 \n\t" |
349 "mulps 32+sseW2, %%xmm5 \n\t" | |
350 "mulps %%xmm7, %%xmm2 \n\t" | 351 "mulps %%xmm7, %%xmm2 \n\t" |
351 "mulps 48+sseW2, %%xmm3 \n\t" | |
352 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | 352 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 |
353 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 | 353 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 |
354 "addps %%xmm4, %%xmm2 \n\t" | 354 "addps %%xmm4, %%xmm2 \n\t" |
355 "addps %%xmm5, %%xmm3 \n\t" | 355 "addps %%xmm5, %%xmm3 \n\t" |
356 "movaps %%xmm2, %%xmm4 \n\t" | 356 "movaps %%xmm2, %%xmm4 \n\t" |
374 for (m=3; m < 7; m++) { | 374 for (m=3; m < 7; m++) { |
375 two_m = (1 << m); | 375 two_m = (1 << m); |
376 | 376 |
377 two_m_plus_one = two_m<<1; | 377 two_m_plus_one = two_m<<1; |
378 | 378 |
379 for(k = 0; k < two_m; k++) { | 379 for(i = 0; i < 128; i += two_m_plus_one) { |
380 for(i = 0; i < 128; i += two_m_plus_one) { | 380 for(k = 0; k < two_m; k++) { |
381 int p = k + i; | |
382 int q = p + two_m; | |
383 tmp_a_r = buf[p].real; | |
384 tmp_a_i = buf[p].imag; | |
385 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
386 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
387 buf[p].real = tmp_a_r + tmp_b_r; | |
388 buf[p].imag = tmp_a_i + tmp_b_i; | |
389 buf[q].real = tmp_a_r - tmp_b_r; | |
390 buf[q].imag = tmp_a_i - tmp_b_i; | |
391 } | |
392 } | |
393 } | |
394 #else | |
395 /* unoptimized variant | |
396 for (m=1; m < 7; m++) { | |
397 if(m) | |
398 two_m = (1 << m); | |
399 else | |
400 two_m = 1; | |
401 | |
402 two_m_plus_one = (1 << (m+1)); | |
403 | |
404 for(i = 0; i < 128; i += two_m_plus_one) { | |
405 for(k = 0; k < two_m; k++) { | |
381 p = k + i; | 406 p = k + i; |
382 q = p + two_m; | 407 q = p + two_m; |
383 tmp_a_r = buf[p].real; | 408 tmp_a_r = buf[p].real; |
384 tmp_a_i = buf[p].imag; | 409 tmp_a_i = buf[p].imag; |
385 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | 410 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; |
389 buf[q].real = tmp_a_r - tmp_b_r; | 414 buf[q].real = tmp_a_r - tmp_b_r; |
390 buf[q].imag = tmp_a_i - tmp_b_i; | 415 buf[q].imag = tmp_a_i - tmp_b_i; |
391 } | 416 } |
392 } | 417 } |
393 } | 418 } |
394 #else | 419 */ |
395 for (m=0; m < 7; m++) { | 420 |
396 if(m) | 421 for(i = 0; i < 128; i += 2) { |
397 two_m = (1 << m); | 422 tmp_a_r = buf[i].real; |
398 else | 423 tmp_a_i = buf[i].imag; |
399 two_m = 1; | 424 tmp_b_r = buf[i+1].real; |
400 | 425 tmp_b_i = buf[i+1].imag; |
401 two_m_plus_one = (1 << (m+1)); | 426 buf[i].real = tmp_a_r + tmp_b_r; |
402 | 427 buf[i].imag = tmp_a_i + tmp_b_i; |
403 for(k = 0; k < two_m; k++) { | 428 buf[i+1].real = tmp_a_r - tmp_b_r; |
404 for(i = 0; i < 128; i += two_m_plus_one) { | 429 buf[i+1].imag = tmp_a_i - tmp_b_i; |
405 p = k + i; | 430 } |
406 q = p + two_m; | 431 |
432 for(i = 0; i < 128; i += 4) { | |
433 tmp_a_r = buf[i].real; | |
434 tmp_a_i = buf[i].imag; | |
435 tmp_b_r = buf[i+2].real; | |
436 tmp_b_i = buf[i+2].imag; | |
437 buf[i].real = tmp_a_r + tmp_b_r; | |
438 buf[i].imag = tmp_a_i + tmp_b_i; | |
439 buf[i+2].real = tmp_a_r - tmp_b_r; | |
440 buf[i+2].imag = tmp_a_i - tmp_b_i; | |
441 tmp_a_r = buf[i+1].real; | |
442 tmp_a_i = buf[i+1].imag; | |
443 tmp_b_r = buf[i+3].imag; | |
444 tmp_b_i = buf[i+3].real; | |
445 buf[i+1].real = tmp_a_r + tmp_b_r; | |
446 buf[i+1].imag = tmp_a_i - tmp_b_i; | |
447 buf[i+3].real = tmp_a_r - tmp_b_r; | |
448 buf[i+3].imag = tmp_a_i + tmp_b_i; | |
449 } | |
450 | |
451 for(i = 0; i < 128; i += 8) { | |
452 tmp_a_r = buf[i].real; | |
453 tmp_a_i = buf[i].imag; | |
454 tmp_b_r = buf[i+4].real; | |
455 tmp_b_i = buf[i+4].imag; | |
456 buf[i].real = tmp_a_r + tmp_b_r; | |
457 buf[i].imag = tmp_a_i + tmp_b_i; | |
458 buf[i+4].real = tmp_a_r - tmp_b_r; | |
459 buf[i+4].imag = tmp_a_i - tmp_b_i; | |
460 tmp_a_r = buf[1+i].real; | |
461 tmp_a_i = buf[1+i].imag; | |
462 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
463 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
464 buf[1+i].real = tmp_a_r + tmp_b_r; | |
465 buf[1+i].imag = tmp_a_i + tmp_b_i; | |
466 buf[i+5].real = tmp_a_r - tmp_b_r; | |
467 buf[i+5].imag = tmp_a_i - tmp_b_i; | |
468 tmp_a_r = buf[i+2].real; | |
469 tmp_a_i = buf[i+2].imag; | |
470 tmp_b_r = buf[i+6].imag; | |
471 tmp_b_i = - buf[i+6].real; | |
472 buf[i+2].real = tmp_a_r + tmp_b_r; | |
473 buf[i+2].imag = tmp_a_i + tmp_b_i; | |
474 buf[i+6].real = tmp_a_r - tmp_b_r; | |
475 buf[i+6].imag = tmp_a_i - tmp_b_i; | |
476 tmp_a_r = buf[i+3].real; | |
477 tmp_a_i = buf[i+3].imag; | |
478 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
479 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
480 buf[i+3].real = tmp_a_r + tmp_b_r; | |
481 buf[i+3].imag = tmp_a_i + tmp_b_i; | |
482 buf[i+7].real = tmp_a_r - tmp_b_r; | |
483 buf[i+7].imag = tmp_a_i - tmp_b_i; | |
484 } | |
485 | |
486 for (m=3; m < 7; m++) { | |
487 two_m = (1 << m); | |
488 | |
489 two_m_plus_one = two_m<<1; | |
490 | |
491 for(i = 0; i < 128; i += two_m_plus_one) { | |
492 for(k = 0; k < two_m; k++) { | |
493 int p = k + i; | |
494 int q = p + two_m; | |
407 tmp_a_r = buf[p].real; | 495 tmp_a_r = buf[p].real; |
408 tmp_a_i = buf[p].imag; | 496 tmp_a_i = buf[p].imag; |
409 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | 497 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; |
410 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | 498 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; |
411 buf[p].real = tmp_a_r + tmp_b_r; | 499 buf[p].real = tmp_a_r + tmp_b_r; |
413 buf[q].real = tmp_a_r - tmp_b_r; | 501 buf[q].real = tmp_a_r - tmp_b_r; |
414 buf[q].imag = tmp_a_i - tmp_b_i; | 502 buf[q].imag = tmp_a_i - tmp_b_i; |
415 } | 503 } |
416 } | 504 } |
417 } | 505 } |
506 | |
418 #endif | 507 #endif |
419 | 508 |
420 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | 509 /* Post IFFT complex multiply plus IFFT complex conjugate*/ |
421 for( i=0; i < 128; i++) { | 510 for( i=0; i < 128; i++) { |
422 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | 511 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ |