comparison liba52/imdct.c @ 3537:d7e5a32643c9

C optimizations sse opt.
author michael
date Sun, 16 Dec 2001 23:00:03 +0000
parents 3483390a902b
children 4e772a3c6b62
comparison
equal deleted inserted replaced
3536:633a8d6e40dc 3537:d7e5a32643c9
330 Note sseW2+16={0,0,sqrt(2),-sqrt(2)} 330 Note sseW2+16={0,0,sqrt(2),-sqrt(2)}
331 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)} 331 Note sseW2+32={0,0,-sqrt(2),-sqrt(2)}
332 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)} 332 Note sseW2+48={1,-1,sqrt(2),-sqrt(2)}
333 */ 333 */
334 asm volatile( 334 asm volatile(
335 "movaps sseW2, %%xmm6 \n\t" 335 "movaps 48+sseW2, %%xmm6 \n\t"
336 "movaps 16+sseW2, %%xmm7 \n\t" 336 "movaps 16+sseW2, %%xmm7 \n\t"
337 "xorps %%xmm5, %%xmm5 \n\t" 337 "xorps %%xmm5, %%xmm5 \n\t"
338 "xorps %%xmm2, %%xmm2 \n\t" 338 "xorps %%xmm2, %%xmm2 \n\t"
339 "movl %0, %%esi \n\t" 339 "movl %0, %%esi \n\t"
340 ".balign 16 \n\t" 340 ".balign 16 \n\t"
341 "1: \n\t" 341 "1: \n\t"
342 "movhps 40(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 342 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
343 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 343 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
344 "movaps 32(%%esi), %%xmm4 \n\t" //r4,i4,r5,i5 344 "movaps sseW2, %%xmm4 \n\t" //first 16 bytes of sseW2 (constant table, not r4,i4,r5,i5)
345 "movhps 56(%%esi), %%xmm5 \n\t" //r6,i6,r7,i7 345 "movaps 32+sseW2, %%xmm5 \n\t" //sseW2+32 = {0,0,-sqrt(2),-sqrt(2)} per the note above
346 "mulps %%xmm2, %%xmm4 \n\t"
347 "mulps %%xmm3, %%xmm5 \n\t"
346 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 348 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
347 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 349 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
348 "mulps %%xmm6, %%xmm4 \n\t" 350 "mulps %%xmm6, %%xmm3 \n\t"
349 "mulps 32+sseW2, %%xmm5 \n\t"
350 "mulps %%xmm7, %%xmm2 \n\t" 351 "mulps %%xmm7, %%xmm2 \n\t"
351 "mulps 48+sseW2, %%xmm3 \n\t"
352 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 352 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
353 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 353 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
354 "addps %%xmm4, %%xmm2 \n\t" 354 "addps %%xmm4, %%xmm2 \n\t"
355 "addps %%xmm5, %%xmm3 \n\t" 355 "addps %%xmm5, %%xmm3 \n\t"
356 "movaps %%xmm2, %%xmm4 \n\t" 356 "movaps %%xmm2, %%xmm4 \n\t"
374 for (m=3; m < 7; m++) { 374 for (m=3; m < 7; m++) {
375 two_m = (1 << m); 375 two_m = (1 << m);
376 376
377 two_m_plus_one = two_m<<1; 377 two_m_plus_one = two_m<<1;
378 378
379 for(k = 0; k < two_m; k++) { 379 for(i = 0; i < 128; i += two_m_plus_one) {
380 for(i = 0; i < 128; i += two_m_plus_one) { 380 for(k = 0; k < two_m; k++) {
381 int p = k + i;
382 int q = p + two_m;
383 tmp_a_r = buf[p].real;
384 tmp_a_i = buf[p].imag;
385 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
386 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
387 buf[p].real = tmp_a_r + tmp_b_r;
388 buf[p].imag = tmp_a_i + tmp_b_i;
389 buf[q].real = tmp_a_r - tmp_b_r;
390 buf[q].imag = tmp_a_i - tmp_b_i;
391 }
392 }
393 }
394 #else
395 /* unoptimized variant
396 for (m=1; m < 7; m++) {
397 if(m)
398 two_m = (1 << m);
399 else
400 two_m = 1;
401
402 two_m_plus_one = (1 << (m+1));
403
404 for(i = 0; i < 128; i += two_m_plus_one) {
405 for(k = 0; k < two_m; k++) {
381 p = k + i; 406 p = k + i;
382 q = p + two_m; 407 q = p + two_m;
383 tmp_a_r = buf[p].real; 408 tmp_a_r = buf[p].real;
384 tmp_a_i = buf[p].imag; 409 tmp_a_i = buf[p].imag;
385 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; 410 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
389 buf[q].real = tmp_a_r - tmp_b_r; 414 buf[q].real = tmp_a_r - tmp_b_r;
390 buf[q].imag = tmp_a_i - tmp_b_i; 415 buf[q].imag = tmp_a_i - tmp_b_i;
391 } 416 }
392 } 417 }
393 } 418 }
394 #else 419 */
395 for (m=0; m < 7; m++) { 420
396 if(m) 421 for(i = 0; i < 128; i += 2) {
397 two_m = (1 << m); 422 tmp_a_r = buf[i].real;
398 else 423 tmp_a_i = buf[i].imag;
399 two_m = 1; 424 tmp_b_r = buf[i+1].real;
400 425 tmp_b_i = buf[i+1].imag;
401 two_m_plus_one = (1 << (m+1)); 426 buf[i].real = tmp_a_r + tmp_b_r;
402 427 buf[i].imag = tmp_a_i + tmp_b_i;
403 for(k = 0; k < two_m; k++) { 428 buf[i+1].real = tmp_a_r - tmp_b_r;
404 for(i = 0; i < 128; i += two_m_plus_one) { 429 buf[i+1].imag = tmp_a_i - tmp_b_i;
405 p = k + i; 430 }
406 q = p + two_m; 431
432 for(i = 0; i < 128; i += 4) {
433 tmp_a_r = buf[i].real;
434 tmp_a_i = buf[i].imag;
435 tmp_b_r = buf[i+2].real;
436 tmp_b_i = buf[i+2].imag;
437 buf[i].real = tmp_a_r + tmp_b_r;
438 buf[i].imag = tmp_a_i + tmp_b_i;
439 buf[i+2].real = tmp_a_r - tmp_b_r;
440 buf[i+2].imag = tmp_a_i - tmp_b_i;
441 tmp_a_r = buf[i+1].real;
442 tmp_a_i = buf[i+1].imag;
443 tmp_b_r = buf[i+3].imag;
444 tmp_b_i = buf[i+3].real;
445 buf[i+1].real = tmp_a_r + tmp_b_r;
446 buf[i+1].imag = tmp_a_i - tmp_b_i;
447 buf[i+3].real = tmp_a_r - tmp_b_r;
448 buf[i+3].imag = tmp_a_i + tmp_b_i;
449 }
450
451 for(i = 0; i < 128; i += 8) {
452 tmp_a_r = buf[i].real;
453 tmp_a_i = buf[i].imag;
454 tmp_b_r = buf[i+4].real;
455 tmp_b_i = buf[i+4].imag;
456 buf[i].real = tmp_a_r + tmp_b_r;
457 buf[i].imag = tmp_a_i + tmp_b_i;
458 buf[i+4].real = tmp_a_r - tmp_b_r;
459 buf[i+4].imag = tmp_a_i - tmp_b_i;
460 tmp_a_r = buf[1+i].real;
461 tmp_a_i = buf[1+i].imag;
462 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
463 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
464 buf[1+i].real = tmp_a_r + tmp_b_r;
465 buf[1+i].imag = tmp_a_i + tmp_b_i;
466 buf[i+5].real = tmp_a_r - tmp_b_r;
467 buf[i+5].imag = tmp_a_i - tmp_b_i;
468 tmp_a_r = buf[i+2].real;
469 tmp_a_i = buf[i+2].imag;
470 tmp_b_r = buf[i+6].imag;
471 tmp_b_i = - buf[i+6].real;
472 buf[i+2].real = tmp_a_r + tmp_b_r;
473 buf[i+2].imag = tmp_a_i + tmp_b_i;
474 buf[i+6].real = tmp_a_r - tmp_b_r;
475 buf[i+6].imag = tmp_a_i - tmp_b_i;
476 tmp_a_r = buf[i+3].real;
477 tmp_a_i = buf[i+3].imag;
478 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
479 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
480 buf[i+3].real = tmp_a_r + tmp_b_r;
481 buf[i+3].imag = tmp_a_i + tmp_b_i;
482 buf[i+7].real = tmp_a_r - tmp_b_r;
483 buf[i+7].imag = tmp_a_i - tmp_b_i;
484 }
485
486 for (m=3; m < 7; m++) {
487 two_m = (1 << m);
488
489 two_m_plus_one = two_m<<1;
490
491 for(i = 0; i < 128; i += two_m_plus_one) {
492 for(k = 0; k < two_m; k++) {
493 int p = k + i;
494 int q = p + two_m;
407 tmp_a_r = buf[p].real; 495 tmp_a_r = buf[p].real;
408 tmp_a_i = buf[p].imag; 496 tmp_a_i = buf[p].imag;
409 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; 497 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
410 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; 498 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
411 buf[p].real = tmp_a_r + tmp_b_r; 499 buf[p].real = tmp_a_r + tmp_b_r;
413 buf[q].real = tmp_a_r - tmp_b_r; 501 buf[q].real = tmp_a_r - tmp_b_r;
414 buf[q].imag = tmp_a_i - tmp_b_i; 502 buf[q].imag = tmp_a_i - tmp_b_i;
415 } 503 }
416 } 504 }
417 } 505 }
506
418 #endif 507 #endif
419 508
420 /* Post IFFT complex multiply plus IFFT complex conjugate*/ 509 /* Post IFFT complex multiply plus IFFT complex conjugate*/
421 for( i=0; i < 128; i++) { 510 for( i=0; i < 128; i++) {
422 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ 511 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */