changeset 1575:f16ae8e69bd9 libavcodec

reorder table instead of wasting instructions to reorder the input to match the table
author michael
date Sat, 25 Oct 2003 21:59:28 +0000
parents 1396e33a9cb6
children bc4b13ca74de
files i386/fdct_mmx.c
diffstat 1 files changed, 107 insertions(+), 123 deletions(-) [+]
line wrap: on
line diff
--- a/i386/fdct_mmx.c	Sat Oct 25 19:44:37 2003 +0000
+++ b/i386/fdct_mmx.c	Sat Oct 25 21:59:28 2003 +0000
@@ -49,85 +49,77 @@
 static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
 
 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
-    //row0
-    16384, 16384, 21407, -8867,     //    w09 w01 w08 w00
-    16384, 16384, 8867, -21407,     //    w13 w05 w12 w04
-    16384, -16384, 8867, 21407,     //    w11 w03 w10 w02
-    -16384, 16384, -21407, -8867,   //    w15 w07 w14 w06
-    22725, 12873, 19266, -22725,    //    w22 w20 w18 w16
-    19266, 4520, -4520, -12873,     //    w23 w21 w19 w17
-    12873, 4520, 4520, 19266,       //    w30 w28 w26 w24
-    -22725, 19266, -12873, -22725,  //    w31 w29 w27 w25
+  16384,   16384,   -8867,  -21407, 
+  16384,   16384,   21407,    8867, 
+  16384,  -16384,   21407,   -8867, 
+ -16384,   16384,    8867,  -21407, 
+  22725,   19266,  -22725,  -12873, 
+  12873,    4520,   19266,   -4520, 
+  12873,  -22725,   19266,  -22725, 
+   4520,   19266,    4520,  -12873, 
 
-    //row1
-    22725, 22725, 29692, -12299,    //    w09 w01 w08 w00
-    22725, 22725, 12299, -29692,    //    w13 w05 w12 w04
-    22725, -22725, 12299, 29692,    //    w11 w03 w10 w02
-    -22725, 22725, -29692, -12299,  //    w15 w07 w14 w06
-    31521, 17855, 26722, -31521,    //    w22 w20 w18 w16
-    26722, 6270, -6270, -17855,     //    w23 w21 w19 w17
-    17855, 6270, 6270, 26722,       //    w30 w28 w26 w24
-    -31521, 26722, -17855, -31521,  //    w31 w29 w27 w25
+  22725,   22725,  -12299,  -29692, 
+  22725,   22725,   29692,   12299, 
+  22725,  -22725,   29692,  -12299, 
+ -22725,   22725,   12299,  -29692, 
+  31521,   26722,  -31521,  -17855, 
+  17855,    6270,   26722,   -6270, 
+  17855,  -31521,   26722,  -31521, 
+   6270,   26722,    6270,  -17855, 
 
-    //row2
-    21407, 21407, 27969, -11585,    //    w09 w01 w08 w00
-    21407, 21407, 11585, -27969,    //    w13 w05 w12 w04
-    21407, -21407, 11585, 27969,    //    w11 w03 w10 w02
-    -21407, 21407, -27969, -11585,  //    w15 w07 w14 w06
-    29692, 16819, 25172, -29692,    //    w22 w20 w18 w16
-    25172, 5906, -5906, -16819,     //    w23 w21 w19 w17
-    16819, 5906, 5906, 25172,       //    w30 w28 w26 w24
-    -29692, 25172, -16819, -29692,  //    w31 w29 w27 w25
+  21407,   21407,  -11585,  -27969, 
+  21407,   21407,   27969,   11585, 
+  21407,  -21407,   27969,  -11585, 
+ -21407,   21407,   11585,  -27969, 
+  29692,   25172,  -29692,  -16819, 
+  16819,    5906,   25172,   -5906, 
+  16819,  -29692,   25172,  -29692, 
+   5906,   25172,    5906,  -16819, 
 
-    //row3
-    19266, 19266, 25172, -10426,    //    w09 w01 w08 w00
-    19266, 19266, 10426, -25172,    //    w13 w05 w12 w04
-    19266, -19266, 10426, 25172,    //    w11 w03 w10 w02
-    -19266, 19266, -25172, -10426,  //    w15 w07 w14 w06, 
-    26722, 15137, 22654, -26722,    //    w22 w20 w18 w16
-    22654, 5315, -5315, -15137,     //    w23 w21 w19 w17
-    15137, 5315, 5315, 22654,       //    w30 w28 w26 w24
-    -26722, 22654, -15137, -26722,  //    w31 w29 w27 w25, 
+  19266,   19266,  -10426,  -25172, 
+  19266,   19266,   25172,   10426, 
+  19266,  -19266,   25172,  -10426, 
+ -19266,   19266,   10426,  -25172, 
+  26722,   22654,  -26722,  -15137, 
+  15137,    5315,   22654,   -5315, 
+  15137,  -26722,   22654,  -26722, 
+   5315,   22654,    5315,  -15137, 
 
-    //row4
-    16384, 16384, 21407, -8867,     //    w09 w01 w08 w00
-    16384, 16384, 8867, -21407,     //    w13 w05 w12 w04
-    16384, -16384, 8867, 21407,     //    w11 w03 w10 w02
-    -16384, 16384, -21407, -8867,   //    w15 w07 w14 w06
-    22725, 12873, 19266, -22725,    //    w22 w20 w18 w16
-    19266, 4520, -4520, -12873,     //    w23 w21 w19 w17
-    12873, 4520, 4520, 19266,       //    w30 w28 w26 w24
-    -22725, 19266, -12873, -22725,  //    w31 w29 w27 w25 
+  16384,   16384,   -8867,  -21407, 
+  16384,   16384,   21407,    8867, 
+  16384,  -16384,   21407,   -8867, 
+ -16384,   16384,    8867,  -21407, 
+  22725,   19266,  -22725,  -12873, 
+  12873,    4520,   19266,   -4520, 
+  12873,  -22725,   19266,  -22725, 
+   4520,   19266,    4520,  -12873, 
 
-    //row5
-    19266, 19266, 25172, -10426,    //    w09 w01 w08 w00
-    19266, 19266, 10426, -25172,    //    w13 w05 w12 w04
-    19266, -19266, 10426, 25172,    //    w11 w03 w10 w02
-    -19266, 19266, -25172, -10426,  //    w15 w07 w14 w06
-    26722, 15137, 22654, -26722,    //    w22 w20 w18 w16
-    22654, 5315, -5315, -15137,     //    w23 w21 w19 w17
-    15137, 5315, 5315, 22654,       //    w30 w28 w26 w24
-    -26722, 22654, -15137, -26722,  //    w31 w29 w27 w25
+  19266,   19266,  -10426,  -25172, 
+  19266,   19266,   25172,   10426, 
+  19266,  -19266,   25172,  -10426, 
+ -19266,   19266,   10426,  -25172, 
+  26722,   22654,  -26722,  -15137, 
+  15137,    5315,   22654,   -5315, 
+  15137,  -26722,   22654,  -26722, 
+   5315,   22654,    5315,  -15137, 
 
-    //row6
-    21407, 21407, 27969, -11585,    //    w09 w01 w08 w00
-    21407, 21407, 11585, -27969,    //    w13 w05 w12 w04
-    21407, -21407, 11585, 27969,    //    w11 w03 w10 w02
-    -21407, 21407, -27969, -11585,  //    w15 w07 w14 w06, 
-    29692, 16819, 25172, -29692,    //    w22 w20 w18 w16
-    25172, 5906, -5906, -16819,     //    w23 w21 w19 w17
-    16819, 5906, 5906, 25172,       //    w30 w28 w26 w24
-    -29692, 25172, -16819, -29692,  //    w31 w29 w27 w25, 
+  21407,   21407,  -11585,  -27969, 
+  21407,   21407,   27969,   11585, 
+  21407,  -21407,   27969,  -11585, 
+ -21407,   21407,   11585,  -27969, 
+  29692,   25172,  -29692,  -16819, 
+  16819,    5906,   25172,   -5906, 
+  16819,  -29692,   25172,  -29692, 
+   5906,   25172,    5906,  -16819, 
 
-    //row7
-    22725, 22725, 29692, -12299,    //    w09 w01 w08 w00
-    22725, 22725, 12299, -29692,    //    w13 w05 w12 w04
-    22725, -22725, 12299, 29692,    //    w11 w03 w10 w02
-    -22725, 22725, -29692, -12299,  //    w15 w07 w14 w06, 
-    31521, 17855, 26722, -31521,    //    w22 w20 w18 w16
-    26722, 6270, -6270, -17855,     //    w23 w21 w19 w17
-    17855, 6270, 6270, 26722,       //    w30 w28 w26 w24
-    -31521, 26722, -17855, -31521   //    w31 w29 w27 w25
+  22725,   22725,  -12299,  -29692, 
+  22725,   22725,   29692,   12299, 
+  22725,  -22725,   29692,  -12299, 
+ -22725,   22725,   12299,  -29692, 
+  31521,   26722,  -31521,  -17855, 
+  17855,    6270,   26722,   -6270, 
+  17855,  -31521,   26722,  -31521, 
+   6270,   26722,    6270,  -17855, 
 };
 
 
@@ -217,73 +209,65 @@
     movq_r2r(mm0, mm1);	
     paddsw_r2r(mm5, mm0);
     psubsw_r2r(mm5, mm1);
-    pshufw_r2r(mm1, mm5, 0x88);
-    pshufw_r2r(mm1, mm6, 0xDD);
-    pshufw_r2r(mm0, mm1, 0xDD);
-    pshufw_r2r(mm0, mm0, 0x88);
-    movq_m2r(*(table + 0), mm3);
-    movq_m2r(*(table + 4), mm4);
-    movq_m2r(*(table + 16), mm2);
+    pshufw_r2r(mm0, mm2, 0x4E);
+    pshufw_r2r(mm1, mm3, 0x4E);
+    movq_m2r(*(table + 0), mm4);
+    movq_m2r(*(table + 4), mm6);
+    movq_m2r(*(table + 16), mm5);
     movq_m2r(*(table + 20), mm7);
-    pmaddwd_r2r(mm0, mm3);
-    pmaddwd_r2r(mm1, mm4);
-    pmaddwd_r2r(mm5, mm2);
-    pmaddwd_r2r(mm6, mm7);
+    pmaddwd_r2r(mm0, mm4);
+    pmaddwd_r2r(mm1, mm5);
+    pmaddwd_r2r(mm2, mm6);
+    pmaddwd_r2r(mm3, mm7);
     pmaddwd_m2r(*(table + 8), mm0);
-    pmaddwd_m2r(*(table + 12), mm1);
-    pmaddwd_m2r(*(table + 24), mm5);
-    pmaddwd_m2r(*(table + 28), mm6);
-    paddd_r2r(mm4, mm3);
-    paddd_r2r(mm7, mm2);
-    paddd_r2r(mm1, mm0);
-    paddd_r2r(mm6, mm5);
+    pmaddwd_m2r(*(table + 12), mm2);
+    pmaddwd_m2r(*(table + 24), mm1);
+    pmaddwd_m2r(*(table + 28), mm3);
+    paddd_r2r(mm6, mm4);
+    paddd_r2r(mm7, mm5);
+    paddd_r2r(mm2, mm0);
+    paddd_r2r(mm3, mm1);
     movq_m2r(*fdct_r_row, mm7);
-    paddd_r2r(mm7, mm3);
-    paddd_r2r(mm7, mm0);
-    paddd_r2r(mm7, mm2);
+    paddd_r2r(mm7, mm4);
     paddd_r2r(mm7, mm5);
-    psrad_i2r(SHIFT_FRW_ROW, mm3);
-    psrad_i2r(SHIFT_FRW_ROW, mm2);
-    psrad_i2r(SHIFT_FRW_ROW, mm0);
+    paddd_r2r(mm7, mm0);
+    paddd_r2r(mm7, mm1);
+    psrad_i2r(SHIFT_FRW_ROW, mm4);
     psrad_i2r(SHIFT_FRW_ROW, mm5);
-    packssdw_r2r(mm0, mm3);
-    packssdw_r2r(mm5, mm2);
-    movq_r2r(mm3, mm6);
-    punpcklwd_r2r(mm2, mm3);
-    punpckhwd_r2r(mm2, mm6);
-    movq_r2m(mm3, *(out + 0));
-    movq_r2m(mm6, *(out + 4));
+    psrad_i2r(SHIFT_FRW_ROW, mm0);
+    psrad_i2r(SHIFT_FRW_ROW, mm1);
+    packssdw_r2r(mm0, mm4);
+    packssdw_r2r(mm1, mm5);
+    movq_r2r(mm4, mm2);
+    punpcklwd_r2r(mm5, mm4);
+    punpckhwd_r2r(mm5, mm2);
+    movq_r2m(mm4, *(out + 0));
+    movq_r2m(mm2, *(out + 4));
 }
 
 static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
 { 
-    movd_m2r(*(in + 6), mm5);
-    punpcklwd_m2r(*(in + 4), mm5);
-    movq_r2r(mm5, mm2);
-    psrlq_i2r(0x20, mm5);
+    movd_m2r(*(in + 6), mm1);
+    punpcklwd_m2r(*(in + 4), mm1);
+    movq_r2r(mm1, mm2);
+    psrlq_i2r(0x20, mm1);
     movq_m2r(*(in + 0), mm0);
-    punpcklwd_r2r(mm2, mm5);
-    movq_r2r(mm0, mm1);	
-    paddsw_r2r(mm5, mm0);
-    psubsw_r2r(mm5, mm1);
-    movq_r2r(mm0, mm2);
-    punpcklwd_r2r(mm1, mm0);
-    punpckhwd_r2r(mm1, mm2);
-    movq_r2r(mm0, mm1);
+    punpcklwd_r2r(mm2, mm1);
+    movq_r2r(mm0, mm5);	
+    paddsw_r2r(mm1, mm0);
+    psubsw_r2r(mm1, mm5);
+    movq_r2r(mm0, mm1);	    
+    movq_r2r(mm5, mm6);	
+    punpckldq_r2r(mm5, mm3);
+    punpckhdq_r2r(mm3, mm6);
     movq_m2r(*(table + 0), mm3);
-    punpcklwd_r2r(mm2, mm0);
-    movq_r2r(mm0, mm5);
-    punpckldq_r2r(mm0, mm0);
     movq_m2r(*(table + 4), mm4);
-    punpckhwd_r2r(mm2, mm1);
+    punpckldq_r2r(mm0, mm2);
     pmaddwd_r2r(mm0, mm3);
-    movq_r2r(mm1, mm6);
+    punpckhdq_r2r(mm2, mm1);
     movq_m2r(*(table + 16), mm2);
-    punpckldq_r2r(mm1, mm1);
     pmaddwd_r2r(mm1, mm4);
-    punpckhdq_r2r(mm5, mm5);
     pmaddwd_m2r(*(table + 8), mm0);
-    punpckhdq_r2r(mm6, mm6);
     movq_m2r(*(table + 20), mm7);
     pmaddwd_r2r(mm5, mm2);
     paddd_m2r(*fdct_r_row, mm3);