Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_rnd.h @ 448:e8c8ca9106aa libavcodec
* removed MANGLE from macros for setting constants
* using MOVQ_WONE/MOVQ_BFE as two instructions instead of a static memory value access,
as it's always faster
* PAVGB_MMX macro now uses mm6 -> mm7 is left unmodified
* replaced original pixels_xy2_mmx with a new, faster, equivalent implementation
* replaced usage of mm7 for anything other than the ZERO constant in the _rnd & _avg files with mm6
author | kabi |
---|---|
date | Thu, 30 May 2002 15:14:56 +0000 |
parents | 810f726ee3cc |
children | b94e82d31b06 |
comparison
equal
deleted
inserted
replaced
447:810f726ee3cc | 448:e8c8ca9106aa |
---|---|
22 */ | 22 */ |
23 | 23 |
24 // put_pixels | 24 // put_pixels |
25 static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 25 static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
26 { | 26 { |
27 MOVQ_BFE(mm6); | |
27 __asm __volatile( | 28 __asm __volatile( |
28 MOVQ_BFE(%%mm7) | |
29 "lea (%3, %3), %%eax \n\t" | 29 "lea (%3, %3), %%eax \n\t" |
30 ".balign 8 \n\t" | 30 ".balign 8 \n\t" |
31 "1: \n\t" | 31 "1: \n\t" |
32 "movq (%1), %%mm0 \n\t" | 32 "movq (%1), %%mm0 \n\t" |
33 "movq 1(%1), %%mm1 \n\t" | 33 "movq 1(%1), %%mm1 \n\t" |
34 "movq (%1, %3), %%mm2 \n\t" | 34 "movq (%1, %3), %%mm2 \n\t" |
35 "movq 1(%1, %3), %%mm3 \n\t" | 35 "movq 1(%1, %3), %%mm3 \n\t" |
36 PAVGBP(%%mm0, %%mm1, %%mm5, %%mm2, %%mm3, %%mm6) | 36 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
37 "movq %%mm5, (%2) \n\t" | 37 "movq %%mm4, (%2) \n\t" |
38 "movq %%mm6, (%2, %3) \n\t" | 38 "movq %%mm5, (%2, %3) \n\t" |
39 "addl %%eax, %1 \n\t" | 39 "addl %%eax, %1 \n\t" |
40 "addl %%eax, %2 \n\t" | 40 "addl %%eax, %2 \n\t" |
41 "movq (%1), %%mm0 \n\t" | 41 "movq (%1), %%mm0 \n\t" |
42 "movq 1(%1), %%mm1 \n\t" | 42 "movq 1(%1), %%mm1 \n\t" |
43 "movq (%1, %3), %%mm2 \n\t" | 43 "movq (%1, %3), %%mm2 \n\t" |
44 "movq 1(%1, %3), %%mm3 \n\t" | 44 "movq 1(%1, %3), %%mm3 \n\t" |
45 PAVGBP(%%mm0, %%mm1, %%mm5, %%mm2, %%mm3, %%mm6) | 45 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
46 "movq %%mm5, (%2) \n\t" | 46 "movq %%mm4, (%2) \n\t" |
47 "movq %%mm6, (%2, %3) \n\t" | 47 "movq %%mm5, (%2, %3) \n\t" |
48 "addl %%eax, %1 \n\t" | 48 "addl %%eax, %1 \n\t" |
49 "addl %%eax, %2 \n\t" | 49 "addl %%eax, %2 \n\t" |
50 "subl $4, %0 \n\t" | 50 "subl $4, %0 \n\t" |
51 "jnz 1b \n\t" | 51 "jnz 1b \n\t" |
52 :"+g"(h), "+S"(pixels), "+D"(block) | 52 :"+g"(h), "+S"(pixels), "+D"(block) |
53 :"r"(line_size) | 53 :"r"(line_size) |
54 :"eax", "memory"); | 54 :"eax", "memory"); |
55 } | 55 } |
56 | 56 |
57 static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 57 static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
58 { | 58 { |
59 __asm __volatile( | 59 MOVQ_BFE(mm6); |
60 MOVQ_BFE(%%mm7) | 60 __asm __volatile( |
61 "lea (%3, %3), %%eax \n\t" | 61 "lea (%3, %3), %%eax \n\t" |
62 "movq (%1), %%mm0 \n\t" | 62 "movq (%1), %%mm0 \n\t" |
63 ".balign 8 \n\t" | 63 ".balign 8 \n\t" |
64 "1: \n\t" | 64 "1: \n\t" |
65 "movq (%1, %3), %%mm1 \n\t" | 65 "movq (%1, %3), %%mm1 \n\t" |
66 "movq (%1, %%eax),%%mm2 \n\t" | 66 "movq (%1, %%eax),%%mm2 \n\t" |
67 PAVGBP(%%mm1, %%mm0, %%mm5, %%mm2, %%mm1, %%mm6) | 67 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) |
68 "movq %%mm5, (%2) \n\t" | 68 "movq %%mm4, (%2) \n\t" |
69 "movq %%mm6, (%2, %3) \n\t" | 69 "movq %%mm5, (%2, %3) \n\t" |
70 "addl %%eax, %1 \n\t" | 70 "addl %%eax, %1 \n\t" |
71 "addl %%eax, %2 \n\t" | 71 "addl %%eax, %2 \n\t" |
72 "movq (%1, %3), %%mm1 \n\t" | 72 "movq (%1, %3), %%mm1 \n\t" |
73 "movq (%1, %%eax),%%mm0 \n\t" | 73 "movq (%1, %%eax),%%mm0 \n\t" |
74 PAVGBP(%%mm1, %%mm2, %%mm5, %%mm0, %%mm1, %%mm6) | 74 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) |
75 "movq %%mm5, (%2) \n\t" | 75 "movq %%mm4, (%2) \n\t" |
76 "movq %%mm6, (%2, %3) \n\t" | 76 "movq %%mm5, (%2, %3) \n\t" |
77 "addl %%eax, %1 \n\t" | 77 "addl %%eax, %1 \n\t" |
78 "addl %%eax, %2 \n\t" | 78 "addl %%eax, %2 \n\t" |
79 "subl $4, %0 \n\t" | 79 "subl $4, %0 \n\t" |
80 "jnz 1b \n\t" | 80 "jnz 1b \n\t" |
81 :"+g"(h), "+S"(pixels), "+D"(block) | 81 :"+g"(h), "+S"(pixels), "+D"(block) |
82 :"r"(line_size) | 82 :"r"(line_size) |
83 :"eax", "memory"); | 83 :"eax", "memory"); |
84 } | 84 } |
85 | 85 |
86 // ((a + b)/2 + (c + d)/2)/2 | 86 static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
87 // not sure if this is properly replacing original code | |
88 // - ok it's really unsable at this moment -> disabled | |
89 static void DEF(put, disabled_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
90 { | 87 { |
88 MOVQ_ZERO(mm7); | |
89 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
91 __asm __volatile( | 90 __asm __volatile( |
92 MOVQ_BFE(%%mm7) | |
93 "lea (%3, %3), %%eax \n\t" | |
94 "movq (%1), %%mm0 \n\t" | |
95 "movq (%1), %%mm0 \n\t" | 91 "movq (%1), %%mm0 \n\t" |
96 "movq 1(%1), %%mm1 \n\t" | 92 "movq 1(%1), %%mm4 \n\t" |
97 ".balign 8 \n\t" | 93 "movq %%mm0, %%mm1 \n\t" |
94 "movq %%mm4, %%mm5 \n\t" | |
95 "punpcklbw %%mm7, %%mm0 \n\t" | |
96 "punpcklbw %%mm7, %%mm4 \n\t" | |
97 "punpckhbw %%mm7, %%mm1 \n\t" | |
98 "punpckhbw %%mm7, %%mm5 \n\t" | |
99 "paddusw %%mm0, %%mm4 \n\t" | |
100 "paddusw %%mm1, %%mm5 \n\t" | |
101 "xorl %%eax, %%eax \n\t" | |
102 "addl %3, %1 \n\t" | |
103 ".balign 4 \n\t" | |
98 "1: \n\t" | 104 "1: \n\t" |
99 "movq (%1, %3), %%mm2 \n\t" | 105 "movq (%1, %%eax), %%mm0 \n\t" |
100 "movq 1(%1, %3), %%mm3 \n\t" | 106 "movq 1(%1, %%eax), %%mm2 \n\t" |
101 PAVGBP(%%mm2, %%mm0, %%mm4, %%mm3, %%mm1, %%mm5) | 107 "movq %%mm0, %%mm1 \n\t" |
102 //PAVGBR(%%mm2, %%mm0, %%mm4) | 108 "movq %%mm2, %%mm3 \n\t" |
103 //PAVGBR(%%mm3, %%mm1, %%mm5) | 109 "punpcklbw %%mm7, %%mm0 \n\t" |
104 PAVGB(%%mm4, %%mm5) | 110 "punpcklbw %%mm7, %%mm2 \n\t" |
105 "movq %%mm6, (%2) \n\t" | 111 "punpckhbw %%mm7, %%mm1 \n\t" |
112 "punpckhbw %%mm7, %%mm3 \n\t" | |
113 "paddusw %%mm2, %%mm0 \n\t" | |
114 "paddusw %%mm3, %%mm1 \n\t" | |
115 "paddusw %%mm6, %%mm4 \n\t" | |
116 "paddusw %%mm6, %%mm5 \n\t" | |
117 "paddusw %%mm0, %%mm4 \n\t" | |
118 "paddusw %%mm1, %%mm5 \n\t" | |
119 "psrlw $2, %%mm4 \n\t" | |
120 "psrlw $2, %%mm5 \n\t" | |
121 "packuswb %%mm5, %%mm4 \n\t" | |
122 "movq %%mm4, (%2, %%eax) \n\t" | |
123 "addl %3, %%eax \n\t" | |
106 | 124 |
107 "movq (%1, %%eax), %%mm0 \n\t" | 125 "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
108 "movq 1(%1, %%eax), %%mm1 \n\t" | 126 "movq 1(%1, %%eax), %%mm4 \n\t" |
109 PAVGBP(%%mm0, %%mm2, %%mm4, %%mm1, %%mm3, %%mm5) | 127 "movq %%mm2, %%mm3 \n\t" |
110 //PAVGBR(%%mm0, %%mm2, %%mm4) | 128 "movq %%mm4, %%mm5 \n\t" |
111 //PAVGBR(%%mm1, %%mm3, %%mm5) | 129 "punpcklbw %%mm7, %%mm2 \n\t" |
112 PAVGB(%%mm4, %%mm5) | 130 "punpcklbw %%mm7, %%mm4 \n\t" |
113 "movq %%mm6, (%2, %3) \n\t" | 131 "punpckhbw %%mm7, %%mm3 \n\t" |
114 "addl %%eax, %1 \n\t" | 132 "punpckhbw %%mm7, %%mm5 \n\t" |
115 "addl %%eax, %2 \n\t" | 133 "paddusw %%mm2, %%mm4 \n\t" |
134 "paddusw %%mm3, %%mm5 \n\t" | |
135 "paddusw %%mm6, %%mm0 \n\t" | |
136 "paddusw %%mm6, %%mm1 \n\t" | |
137 "paddusw %%mm4, %%mm0 \n\t" | |
138 "paddusw %%mm5, %%mm1 \n\t" | |
139 "psrlw $2, %%mm0 \n\t" | |
140 "psrlw $2, %%mm1 \n\t" | |
141 "packuswb %%mm1, %%mm0 \n\t" | |
142 "movq %%mm0, (%2, %%eax) \n\t" | |
143 "addl %3, %%eax \n\t" | |
116 | 144 |
117 "subl $2, %0 \n\t" | 145 "subl $2, %0 \n\t" |
118 | |
119 "jnz 1b \n\t" | 146 "jnz 1b \n\t" |
120 :"+g"(h), "+S"(pixels), "+D"(block) | 147 :"+g"(h), "+S"(pixels) |
121 :"r"(line_size) | 148 :"D"(block), "r"(line_size) |
122 :"eax", "memory"); | 149 :"eax", "memory"); |
123 } | 150 } |
124 | 151 |
125 // avg_pixels | 152 // avg_pixels |
126 |