Mercurial > mplayer.hg
comparison libswscale/yuv2rgb_template.c @ 23157:ebc55c913d73
cosmetics attack, part III: Remove all tabs and prettyprint/reindent the code.
author | diego |
---|---|
date | Sun, 29 Apr 2007 21:36:43 +0000 |
parents | 9528d1ebe68f |
children | 2c8b8b4e63c8 |
comparison
equal
deleted
inserted
replaced
23156:f9a8f92087ef | 23157:ebc55c913d73 |
---|---|
45 #define MOVNTQ "movq" | 45 #define MOVNTQ "movq" |
46 #define SFENCE "/nop" | 46 #define SFENCE "/nop" |
47 #endif | 47 #endif |
48 | 48 |
49 #define YUV2RGB \ | 49 #define YUV2RGB \ |
50 /* Do the multiply part of the conversion for even and odd pixels, | 50 /* Do the multiply part of the conversion for even and odd pixels, |
51 register usage: | 51 register usage: |
52 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | 52 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, |
53 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | 53 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, |
54 mm6 -> Y even, mm7 -> Y odd */\ | 54 mm6 -> Y even, mm7 -> Y odd */\ |
55 /* convert the chroma part */\ | 55 /* convert the chroma part */\ |
56 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ | 56 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ |
57 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ | 57 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ |
58 \ | 58 \ |
59 "psllw $3, %%mm0;" /* Promote precision */ \ | 59 "psllw $3, %%mm0;" /* Promote precision */ \ |
60 "psllw $3, %%mm1;" /* Promote precision */ \ | 60 "psllw $3, %%mm1;" /* Promote precision */ \ |
61 \ | 61 \ |
62 "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \ | 62 "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \ |
63 "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \ | 63 "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \ |
64 \ | 64 \ |
65 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ | 65 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ |
66 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ | 66 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ |
67 \ | 67 \ |
68 "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \ | 68 "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \ |
69 "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \ | 69 "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \ |
70 \ | 70 \ |
71 "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\ | 71 "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\ |
72 "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\ | 72 "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\ |
73 \ | 73 \ |
74 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\ | 74 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\ |
75 \ | 75 \ |
76 /* convert the luma part */\ | 76 /* convert the luma part */\ |
77 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\ | 77 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\ |
78 "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\ | 78 "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\ |
79 \ | 79 \ |
80 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\ | 80 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\ |
81 \ | 81 \ |
82 "psllw $3, %%mm6;" /* Promote precision */\ | 82 "psllw $3, %%mm6;" /* Promote precision */\ |
83 "psllw $3, %%mm7;" /* Promote precision */\ | 83 "psllw $3, %%mm7;" /* Promote precision */\ |
84 \ | 84 \ |
85 "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\ | 85 "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\ |
86 "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\ | 86 "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\ |
87 \ | 87 \ |
88 "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\ | 88 "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\ |
89 "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\ | 89 "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\ |
90 \ | 90 \ |
91 /* Do the addition part of the conversion for even and odd pixels, | 91 /* Do the addition part of the conversion for even and odd pixels, |
92 register usage: | 92 register usage: |
93 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | 93 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, |
94 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | 94 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, |
95 mm6 -> Y even, mm7 -> Y odd */\ | 95 mm6 -> Y even, mm7 -> Y odd */\ |
96 "movq %%mm0, %%mm3;" /* Copy Cblue */\ | 96 "movq %%mm0, %%mm3;" /* Copy Cblue */\ |
97 "movq %%mm1, %%mm4;" /* Copy Cred */\ | 97 "movq %%mm1, %%mm4;" /* Copy Cred */\ |
98 "movq %%mm2, %%mm5;" /* Copy Cgreen */\ | 98 "movq %%mm2, %%mm5;" /* Copy Cgreen */\ |
99 \ | 99 \ |
100 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\ | 100 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\ |
101 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\ | 101 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\ |
102 \ | 102 \ |
103 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\ | 103 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\ |
104 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\ | 104 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\ |
105 \ | 105 \ |
106 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\ | 106 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\ |
107 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\ | 107 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\ |
108 \ | 108 \ |
109 /* Limit RGB even to 0..255 */\ | 109 /* Limit RGB even to 0..255 */\ |
110 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\ | 110 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\ |
111 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\ | 111 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\ |
112 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\ | 112 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\ |
113 \ | 113 \ |
114 /* Limit RGB odd to 0..255 */\ | 114 /* Limit RGB odd to 0..255 */\ |
115 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\ | 115 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\ |
116 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\ | 116 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\ |
117 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\ | 117 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\ |
118 \ | 118 \ |
119 /* Interleave RGB even and odd */\ | 119 /* Interleave RGB even and odd */\ |
120 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\ | 120 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\ |
121 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\ | 121 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\ |
122 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\ | 122 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\ |
123 | 123 |
124 | 124 |
125 static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | 125 static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
126 int srcSliceH, uint8_t* dst[], int dstStride[]){ | 126 int srcSliceH, uint8_t* dst[], int dstStride[]){ |
127 int y, h_size; | 127 int y, h_size; |
128 | 128 |
129 if(c->srcFormat == PIX_FMT_YUV422P){ | 129 if(c->srcFormat == PIX_FMT_YUV422P){ |
130 srcStride[1] *= 2; | 130 srcStride[1] *= 2; |
131 srcStride[2] *= 2; | 131 srcStride[2] *= 2; |
132 } | 132 } |
133 | 133 |
134 h_size= (c->dstW+7)&~7; | 134 h_size= (c->dstW+7)&~7; |
135 if(h_size*2 > FFABS(dstStride[0])) h_size-=8; | 135 if(h_size*2 > FFABS(dstStride[0])) h_size-=8; |
136 | 136 |
137 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); | 137 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); |
138 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], | 138 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], |
139 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); | 139 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); |
140 for (y= 0; y<srcSliceH; y++ ) { | 140 for (y= 0; y<srcSliceH; y++ ) { |
141 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; | 141 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; |
142 uint8_t *_py = src[0] + y*srcStride[0]; | 142 uint8_t *_py = src[0] + y*srcStride[0]; |
143 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; | 143 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; |
144 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; | 144 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; |
145 long index= -h_size/2; | 145 long index= -h_size/2; |
146 | 146 |
147 b5Dither= dither8[y&1]; | 147 b5Dither= dither8[y&1]; |
148 g6Dither= dither4[y&1]; | 148 g6Dither= dither4[y&1]; |
149 g5Dither= dither8[y&1]; | 149 g5Dither= dither8[y&1]; |
150 r5Dither= dither8[(y+1)&1]; | 150 r5Dither= dither8[(y+1)&1]; |
151 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8 | 151 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8 |
152 pixels in each iteration */ | 152 pixels in each iteration */ |
153 __asm__ __volatile__ ( | 153 __asm__ __volatile__ ( |
154 /* load data for start of next scan line */ | 154 /* load data for start of next scan line */ |
155 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 155 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
156 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 156 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
157 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 157 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
158 // ".balign 16 \n\t" | 158 //".balign 16 \n\t" |
159 "1: \n\t" | 159 "1: \n\t" |
160 /* no speed difference on my p3@500 with prefetch, | 160 /* no speed difference on my p3@500 with prefetch, |
161 * if it is faster for anyone with -benchmark then tell me | 161 * if it is faster for anyone with -benchmark then tell me |
162 PREFETCH" 64(%0) \n\t" | 162 PREFETCH" 64(%0) \n\t" |
163 PREFETCH" 64(%1) \n\t" | 163 PREFETCH" 64(%1) \n\t" |
164 PREFETCH" 64(%2) \n\t" | 164 PREFETCH" 64(%2) \n\t" |
165 */ | 165 */ |
166 YUV2RGB | 166 YUV2RGB |
167 | 167 |
168 #ifdef DITHER1XBPP | 168 #ifdef DITHER1XBPP |
169 "paddusb "MANGLE(b5Dither)", %%mm0;" | 169 "paddusb "MANGLE(b5Dither)", %%mm0;" |
170 "paddusb "MANGLE(g6Dither)", %%mm2;" | 170 "paddusb "MANGLE(g6Dither)", %%mm2;" |
171 "paddusb "MANGLE(r5Dither)", %%mm1;" | 171 "paddusb "MANGLE(r5Dither)", %%mm1;" |
172 #endif | 172 #endif |
173 /* mask unneeded bits off */ | 173 /* mask unneeded bits off */ |
174 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ | 174 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ |
175 "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ | 175 "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ |
176 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ | 176 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ |
177 | 177 |
178 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ | 178 "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ |
179 "pxor %%mm4, %%mm4;" /* zero mm4 */ | 179 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
180 | 180 |
181 "movq %%mm0, %%mm5;" /* Copy B7-B0 */ | 181 "movq %%mm0, %%mm5;" /* Copy B7-B0 */ |
182 "movq %%mm2, %%mm7;" /* Copy G7-G0 */ | 182 "movq %%mm2, %%mm7;" /* Copy G7-G0 */ |
183 | 183 |
184 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ | 184 /* convert rgb24 plane to rgb16 pack for pixel 0-3 */ |
185 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ | 185 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ |
186 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ | 186 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
187 | 187 |
188 "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ | 188 "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ |
189 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ | 189 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ |
190 | 190 |
191 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 191 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
192 MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ | 192 MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ |
193 | 193 |
194 /* convert rgb24 plane to rgb16 pack for pixel 4-7 */ | 194 /* convert rgb24 plane to rgb16 pack for pixel 4-7 */ |
195 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ | 195 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ |
196 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ | 196 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
197 | 197 |
198 "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ | 198 "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ |
199 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 199 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
200 | 200 |
201 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ | 201 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ |
202 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 202 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
203 | 203 |
204 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ | 204 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ |
205 | 205 |
206 "add $16, %1 \n\t" | 206 "add $16, %1 \n\t" |
207 "add $4, %0 \n\t" | 207 "add $4, %0 \n\t" |
208 " js 1b \n\t" | 208 " js 1b \n\t" |
209 | 209 |
210 : "+r" (index), "+r" (_image) | 210 : "+r" (index), "+r" (_image) |
211 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) | 211 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) |
212 ); | 212 ); |
213 } | 213 } |
214 | 214 |
215 __asm__ __volatile__ (EMMS); | 215 __asm__ __volatile__ (EMMS); |
216 | 216 |
217 return srcSliceH; | 217 return srcSliceH; |
218 } | 218 } |
219 | 219 |
220 static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | 220 static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
221 int srcSliceH, uint8_t* dst[], int dstStride[]){ | 221 int srcSliceH, uint8_t* dst[], int dstStride[]){ |
222 int y, h_size; | 222 int y, h_size; |
223 | 223 |
224 if(c->srcFormat == PIX_FMT_YUV422P){ | 224 if(c->srcFormat == PIX_FMT_YUV422P){ |
225 srcStride[1] *= 2; | 225 srcStride[1] *= 2; |
226 srcStride[2] *= 2; | 226 srcStride[2] *= 2; |
227 } | 227 } |
228 | 228 |
229 h_size= (c->dstW+7)&~7; | 229 h_size= (c->dstW+7)&~7; |
230 if(h_size*2 > FFABS(dstStride[0])) h_size-=8; | 230 if(h_size*2 > FFABS(dstStride[0])) h_size-=8; |
231 | 231 |
232 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); | 232 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); |
233 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], | 233 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], |
234 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); | 234 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); |
235 for (y= 0; y<srcSliceH; y++ ) { | 235 for (y= 0; y<srcSliceH; y++ ) { |
236 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; | 236 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; |
237 uint8_t *_py = src[0] + y*srcStride[0]; | 237 uint8_t *_py = src[0] + y*srcStride[0]; |
238 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; | 238 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; |
239 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; | 239 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; |
240 long index= -h_size/2; | 240 long index= -h_size/2; |
241 | 241 |
242 b5Dither= dither8[y&1]; | 242 b5Dither= dither8[y&1]; |
243 g6Dither= dither4[y&1]; | 243 g6Dither= dither4[y&1]; |
244 g5Dither= dither8[y&1]; | 244 g5Dither= dither8[y&1]; |
245 r5Dither= dither8[(y+1)&1]; | 245 r5Dither= dither8[(y+1)&1]; |
246 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8 | 246 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8 |
247 pixels in each iteration */ | 247 pixels in each iteration */ |
248 __asm__ __volatile__ ( | 248 __asm__ __volatile__ ( |
249 /* load data for start of next scan line */ | 249 /* load data for start of next scan line */ |
250 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 250 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
251 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 251 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
252 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 252 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
253 // ".balign 16 \n\t" | 253 //".balign 16 \n\t" |
254 "1: \n\t" | 254 "1: \n\t" |
255 YUV2RGB | 255 YUV2RGB |
256 | 256 |
257 #ifdef DITHER1XBPP | 257 #ifdef DITHER1XBPP |
258 "paddusb "MANGLE(b5Dither)", %%mm0 \n\t" | 258 "paddusb "MANGLE(b5Dither)", %%mm0 \n\t" |
259 "paddusb "MANGLE(g5Dither)", %%mm2 \n\t" | 259 "paddusb "MANGLE(g5Dither)", %%mm2 \n\t" |
260 "paddusb "MANGLE(r5Dither)", %%mm1 \n\t" | 260 "paddusb "MANGLE(r5Dither)", %%mm1 \n\t" |
261 #endif | 261 #endif |
262 | 262 |
263 /* mask unneeded bits off */ | 263 /* mask unneeded bits off */ |
264 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ | 264 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ |
265 "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */ | 265 "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */ |
266 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ | 266 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ |
267 | 267 |
268 "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ | 268 "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ |
269 "psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */ | 269 "psrlw $1, %%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */ |
270 "pxor %%mm4, %%mm4;" /* zero mm4 */ | 270 "pxor %%mm4, %%mm4;" /* zero mm4 */ |
271 | 271 |
272 "movq %%mm0, %%mm5;" /* Copy B7-B0 */ | 272 "movq %%mm0, %%mm5;" /* Copy B7-B0 */ |
273 "movq %%mm2, %%mm7;" /* Copy G7-G0 */ | 273 "movq %%mm2, %%mm7;" /* Copy G7-G0 */ |
274 | 274 |
275 /* convert rgb24 plane to rgb15 pack for pixel 0-3 */ | 275 /* convert rgb24 plane to rgb15 pack for pixel 0-3 */ |
276 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ | 276 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ |
277 "punpcklbw %%mm1, %%mm0;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */ | 277 "punpcklbw %%mm1, %%mm0;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */ |
278 | 278 |
279 "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ | 279 "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ |
280 "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ | 280 "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ |
281 | 281 |
282 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 282 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
283 MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ | 283 MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ |
284 | 284 |
285 /* convert rgb24 plane to rgb15 pack for pixel 4-7 */ | 285 /* convert rgb24 plane to rgb15 pack for pixel 4-7 */ |
286 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ | 286 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ |
287 "punpckhbw %%mm1, %%mm5;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */ | 287 "punpckhbw %%mm1, %%mm5;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */ |
288 | 288 |
289 "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ | 289 "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ |
290 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 290 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
291 | 291 |
292 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ | 292 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ |
293 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 293 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
294 | 294 |
295 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ | 295 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ |
296 | 296 |
297 "add $16, %1 \n\t" | 297 "add $16, %1 \n\t" |
298 "add $4, %0 \n\t" | 298 "add $4, %0 \n\t" |
299 " js 1b \n\t" | 299 " js 1b \n\t" |
300 : "+r" (index), "+r" (_image) | 300 : "+r" (index), "+r" (_image) |
301 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) | 301 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) |
302 ); | 302 ); |
303 } | 303 } |
304 | 304 |
305 __asm__ __volatile__ (EMMS); | 305 __asm__ __volatile__ (EMMS); |
306 return srcSliceH; | 306 return srcSliceH; |
307 } | 307 } |
308 | 308 |
309 static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | 309 static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
310 int srcSliceH, uint8_t* dst[], int dstStride[]){ | 310 int srcSliceH, uint8_t* dst[], int dstStride[]){ |
311 int y, h_size; | 311 int y, h_size; |
312 | 312 |
313 if(c->srcFormat == PIX_FMT_YUV422P){ | 313 if(c->srcFormat == PIX_FMT_YUV422P){ |
314 srcStride[1] *= 2; | 314 srcStride[1] *= 2; |
315 srcStride[2] *= 2; | 315 srcStride[2] *= 2; |
316 } | 316 } |
317 | 317 |
318 h_size= (c->dstW+7)&~7; | 318 h_size= (c->dstW+7)&~7; |
319 if(h_size*3 > FFABS(dstStride[0])) h_size-=8; | 319 if(h_size*3 > FFABS(dstStride[0])) h_size-=8; |
320 | 320 |
321 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); | 321 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); |
322 | 322 |
323 for (y= 0; y<srcSliceH; y++ ) { | 323 for (y= 0; y<srcSliceH; y++ ) { |
324 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; | 324 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; |
325 uint8_t *_py = src[0] + y*srcStride[0]; | 325 uint8_t *_py = src[0] + y*srcStride[0]; |
326 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; | 326 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; |
327 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; | 327 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; |
328 long index= -h_size/2; | 328 long index= -h_size/2; |
329 | 329 |
330 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8 | 330 /* this mmx assembly code deals with a SINGLE scan line at a time; it converts 8 |
331 pixels in each iteration */ | 331 pixels in each iteration */ |
332 __asm__ __volatile__ ( | 332 __asm__ __volatile__ ( |
333 /* load data for start of next scan line */ | 333 /* load data for start of next scan line */ |
334 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 334 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
335 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 335 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
336 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 336 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
337 // ".balign 16 \n\t" | 337 //".balign 16 \n\t" |
338 "1: \n\t" | 338 "1: \n\t" |
339 YUV2RGB | 339 YUV2RGB |
340 /* mm0=B, %%mm2=G, %%mm1=R */ | 340 /* mm0=B, %%mm2=G, %%mm1=R */ |
341 #ifdef HAVE_MMX2 | 341 #ifdef HAVE_MMX2 |
342 "movq "MANGLE(M24A)", %%mm4 \n\t" | 342 "movq "MANGLE(M24A)", %%mm4 \n\t" |
343 "movq "MANGLE(M24C)", %%mm7 \n\t" | 343 "movq "MANGLE(M24C)", %%mm7 \n\t" |
344 "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */ | 344 "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */ |
345 "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */ | 345 "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */ |
346 "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */ | 346 "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */ |
347 | 347 |
348 "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */ | 348 "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */ |
349 "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */ | 349 "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */ |
350 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */ | 350 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */ |
351 | 351 |
352 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */ | 352 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */ |
353 "por %%mm5, %%mm6 \n\t" | 353 "por %%mm5, %%mm6 \n\t" |
354 "por %%mm3, %%mm6 \n\t" | 354 "por %%mm3, %%mm6 \n\t" |
355 MOVNTQ" %%mm6, (%1) \n\t" | 355 MOVNTQ" %%mm6, (%1) \n\t" |
356 | 356 |
357 "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */ | 357 "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */ |
358 "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */ | 358 "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */ |
359 "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */ | 359 "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */ |
360 "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */ | 360 "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */ |
361 | 361 |
362 "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */ | 362 "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */ |
363 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */ | 363 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */ |
364 "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */ | 364 "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */ |
365 | 365 |
366 "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */ | 366 "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */ |
367 "por %%mm3, %%mm6 \n\t" | 367 "por %%mm3, %%mm6 \n\t" |
368 MOVNTQ" %%mm6, 8(%1) \n\t" | 368 MOVNTQ" %%mm6, 8(%1) \n\t" |
369 | 369 |
370 "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */ | 370 "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */ |
371 "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */ | 371 "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */ |
372 "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */ | 372 "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */ |
373 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 373 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
374 | 374 |
375 "pand %%mm7, %%mm5 \n\t" /* B7 B6 */ | 375 "pand %%mm7, %%mm5 \n\t" /* B7 B6 */ |
376 "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */ | 376 "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */ |
377 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */ | 377 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */ |
378 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 378 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
379 \ | 379 \ |
380 "por %%mm5, %%mm3 \n\t" | 380 "por %%mm5, %%mm3 \n\t" |
381 "por %%mm3, %%mm6 \n\t" | 381 "por %%mm3, %%mm6 \n\t" |
382 MOVNTQ" %%mm6, 16(%1) \n\t" | 382 MOVNTQ" %%mm6, 16(%1) \n\t" |
383 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 383 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
384 "pxor %%mm4, %%mm4 \n\t" | 384 "pxor %%mm4, %%mm4 \n\t" |
385 | 385 |
386 #else | 386 #else |
387 | 387 |
388 "pxor %%mm4, %%mm4 \n\t" | 388 "pxor %%mm4, %%mm4 \n\t" |
389 "movq %%mm0, %%mm5 \n\t" /* B */ | 389 "movq %%mm0, %%mm5 \n\t" /* B */ |
390 "movq %%mm1, %%mm6 \n\t" /* R */ | 390 "movq %%mm1, %%mm6 \n\t" /* R */ |
391 "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */ | 391 "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */ |
392 "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */ | 392 "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */ |
393 "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */ | 393 "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */ |
394 "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */ | 394 "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */ |
395 "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */ | 395 "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */ |
396 "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */ | 396 "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */ |
397 "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */ | 397 "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */ |
398 "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */ | 398 "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */ |
399 "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */ | 399 "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */ |
400 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */ | 400 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */ |
401 | 401 |
402 "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */ | 402 "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */ |
403 "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */ | 403 "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */ |
404 "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */ | 404 "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */ |
405 "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */ | 405 "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */ |
406 | 406 |
407 "psllq $40, %%mm7 \n\t" /* RGB00000 0 */ | 407 "psllq $40, %%mm7 \n\t" /* RGB00000 0 */ |
408 "psllq $40, %%mm0 \n\t" /* RGB00000 1 */ | 408 "psllq $40, %%mm0 \n\t" /* RGB00000 1 */ |
409 "psllq $40, %%mm5 \n\t" /* RGB00000 2 */ | 409 "psllq $40, %%mm5 \n\t" /* RGB00000 2 */ |
410 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */ | 410 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */ |
411 | 411 |
412 "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */ | 412 "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */ |
413 "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */ | 413 "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */ |
414 "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */ | 414 "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */ |
415 "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */ | 415 "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */ |
416 | 416 |
417 "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */ | 417 "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */ |
418 "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */ | 418 "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */ |
419 "psllq $40, %%mm0 \n\t" /* GB000000 1 */ | 419 "psllq $40, %%mm0 \n\t" /* GB000000 1 */ |
420 "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */ | 420 "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */ |
421 MOVNTQ" %%mm7, (%1) \n\t" | 421 MOVNTQ" %%mm7, (%1) \n\t" |
422 | 422 |
423 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 423 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
424 | 424 |
425 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */ | 425 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */ |
426 "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */ | 426 "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */ |
427 "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */ | 427 "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */ |
428 "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */ | 428 "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */ |
429 MOVNTQ" %%mm6, 8(%1) \n\t" | 429 MOVNTQ" %%mm6, 8(%1) \n\t" |
430 | 430 |
431 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 431 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
432 | 432 |
433 "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */ | 433 "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */ |
434 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */ | 434 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */ |
435 "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */ | 435 "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */ |
436 MOVNTQ" %%mm1, 16(%1) \n\t" | 436 MOVNTQ" %%mm1, 16(%1) \n\t" |
437 | 437 |
438 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 438 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
439 "pxor %%mm4, %%mm4 \n\t" | 439 "pxor %%mm4, %%mm4 \n\t" |
440 #endif | 440 #endif |
441 | 441 |
442 "add $24, %1 \n\t" | 442 "add $24, %1 \n\t" |
443 "add $4, %0 \n\t" | 443 "add $4, %0 \n\t" |
444 " js 1b \n\t" | 444 " js 1b \n\t" |
445 | 445 |
446 : "+r" (index), "+r" (_image) | 446 : "+r" (index), "+r" (_image) |
447 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) | 447 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) |
448 ); | 448 ); |
449 } | 449 } |
450 | 450 |
451 __asm__ __volatile__ (EMMS); | 451 __asm__ __volatile__ (EMMS); |
452 return srcSliceH; | 452 return srcSliceH; |
453 } | 453 } |
454 | 454 |
/* Convert srcSliceH rows of planar YUV (Y, Cb and Cr planes in src[0], src[1]
   and src[2]) to packed 32-bit pixels in dst[0], one scan line per iteration
   of the outer loop, using the MMX YUV2RGB macro.
   Chroma rows are selected with y>>1, i.e. one chroma row per two luma rows.
   For PIX_FMT_YUV422P input the chroma strides are doubled IN PLACE in the
   caller's srcStride[] array, so the same y>>1 indexing then steps over every
   second chroma row.
   NOTE(review): because srcStride[] is modified, calling this twice with the
   same array would double the strides again - confirm callers pass a copy.
   Returns srcSliceH. */
455 static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | 455 static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, |
456 int srcSliceH, uint8_t* dst[], int dstStride[]){ | 456 int srcSliceH, uint8_t* dst[], int dstStride[]){ |
457 int y, h_size; | 457 int y, h_size; |
458 | 458 |
459 if(c->srcFormat == PIX_FMT_YUV422P){ | 459 if(c->srcFormat == PIX_FMT_YUV422P){ |
460 srcStride[1] *= 2; /* writes through to the caller's array */ | 460 srcStride[1] *= 2; /* writes through to the caller's array */ |
461 srcStride[2] *= 2; /* writes through to the caller's array */ | 461 srcStride[2] *= 2; /* writes through to the caller's array */ |
462 } | 462 } |
463 | 463 |
464 h_size= (c->dstW+7)&~7; /* dstW rounded up to a multiple of 8 pixels */ | 464 h_size= (c->dstW+7)&~7; /* dstW rounded up to a multiple of 8 pixels */ |
465 if(h_size*4 > FFABS(dstStride[0])) h_size-=8; /* drop the last group of 8 if 4 bytes/pixel would exceed one dst row */ | 465 if(h_size*4 > FFABS(dstStride[0])) h_size-=8; /* drop the last group of 8 if 4 bytes/pixel would exceed one dst row */ |
466 | 466 |
/* NOTE(review): the loop below enters YUV2RGB with mm4 expected to be zero
   (the macro unpacks chroma bytes against mm4); this relies on mm4 surviving
   between separate asm statements, which no constraint expresses. */
467 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); | 467 __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); |
468 | 468 |
469 for (y= 0; y<srcSliceH; y++ ) { | 469 for (y= 0; y<srcSliceH; y++ ) { |
470 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; | 470 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; |
471 uint8_t *_py = src[0] + y*srcStride[0]; | 471 uint8_t *_py = src[0] + y*srcStride[0]; |
472 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; | 472 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; |
473 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; | 473 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; |
474 long index= -h_size/2; /* negative count of chroma bytes; the asm loop adds 4 per pass until it is no longer negative */ | 474 long index= -h_size/2; /* negative count of chroma bytes; the asm loop adds 4 per pass until it is no longer negative */ |
475 | 475 |
476 /* this mmx assembly code deals with a SINGLE scan line at a time, it converts 8 | 476 /* this mmx assembly code deals with a SINGLE scan line at a time, it converts 8 |
477 pixels in each iteration */ | 477 pixels in each iteration */ |
/* asm operands: %0 = index, %1 = _image, %2 = _pu - index, %3 = _pv - index,
   %4 = &c->redDither, %5 = _py - 2*index. The source pointers are biased by
   the (negative) starting index so that (%2, %0), (%3, %0) and (%5, %0, 2)
   address the start of the row on the first pass. */
478 __asm__ __volatile__ ( | 478 __asm__ __volatile__ ( |
479 /* load data for the first 8 pixels of this scan line */ | 479 /* load data for the first 8 pixels of this scan line */ |
480 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 480 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
481 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 481 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
482 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 482 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
483 // ".balign 16 \n\t" | 483 //".balign 16 \n\t" |
484 "1: \n\t" | 484 "1: \n\t" |
485 YUV2RGB | 485 YUV2RGB |
486 /* convert RGB plane to RGB packed format, | 486 /* convert RGB plane to RGB packed format, |
487 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, | 487 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, |
488 mm4 -> GB, mm5 -> AR pixel 4-7, | 488 mm4 -> GB, mm5 -> AR pixel 4-7, |
489 mm6 -> GB, mm7 -> AR pixel 0-3 */ | 489 mm6 -> GB, mm7 -> AR pixel 0-3 */ |
490 "pxor %%mm3, %%mm3;" /* zero mm3 */ | 490 "pxor %%mm3, %%mm3;" /* zero mm3 */ |
491 | 491 |
492 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | 492 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
493 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ | 493 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ |
494 | 494 |
495 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | 495 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
496 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ | 496 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ |
497 | 497 |
498 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ | 498 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
499 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ | 499 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ |
500 | 500 |
501 "punpcklwd %%mm7, %%mm6;" /* 00 R1 G1 B1 00 R0 G0 B0 */ | 501 "punpcklwd %%mm7, %%mm6;" /* 00 R1 G1 B1 00 R0 G0 B0 */ |
502 MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ | 502 MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ |
503 | 503 |
504 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | 504 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
505 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ | 505 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
506 | 506 |
507 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 G2 B2 */ | 507 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 G2 B2 */ |
508 MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ | 508 MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ |
509 | 509 |
510 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ | 510 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ |
511 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ | 511 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ |
512 | 512 |
513 "punpcklwd %%mm5, %%mm4;" /* 00 R5 G5 B5 00 R4 G4 B4 */ | 513 "punpcklwd %%mm5, %%mm4;" /* 00 R5 G5 B5 00 R4 G4 B4 */ |
514 MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ | 514 MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ |
515 | 515 |
516 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | 516 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ |
517 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ | 517 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ |
518 | 518 |
519 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 G6 B6 */ | 519 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 G6 B6 */ |
520 MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ | 520 MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ |
521 | 521 |
/* Look-ahead loads for the next pass, issued before index is advanced.
   NOTE(review): on the final pass these read 4 chroma bytes and 8 luma
   bytes just past the h_size pixels being converted - confirm the source
   rows are padded accordingly. */
522 "movd 4 (%2, %0), %%mm0;" /* Load next 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | 522 "movd 4 (%2, %0), %%mm0;" /* Load next 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
523 "movd 4 (%3, %0), %%mm1;" /* Load next 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | 523 "movd 4 (%3, %0), %%mm1;" /* Load next 4 Cr 00 00 00 00 v3 v2 v1 v0 */ |
524 | 524 |
525 "pxor %%mm4, %%mm4;" /* zero mm4 again for the next YUV2RGB pass */ | 525 "pxor %%mm4, %%mm4;" /* zero mm4 again for the next YUV2RGB pass */ |
526 "movq 8 (%5, %0, 2), %%mm6;" /* Load next 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | 526 "movq 8 (%5, %0, 2), %%mm6;" /* Load next 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
527 | 527 |
528 "add $32, %1 \n\t" /* advance dst by 8 pixels * 4 bytes */ | 528 "add $32, %1 \n\t" /* advance dst by 8 pixels * 4 bytes */ |
529 "add $4, %0 \n\t" /* 4 chroma samples (8 pixels) consumed */ | 529 "add $4, %0 \n\t" /* 4 chroma samples (8 pixels) consumed */ |
530 " js 1b \n\t" /* loop while index is still negative */ | 530 " js 1b \n\t" /* loop while index is still negative */ |
531 | 531 |
532 : "+r" (index), "+r" (_image) | 532 : "+r" (index), "+r" (_image) |
533 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) | 533 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) |
534 ); | 534 ); |
535 } | 535 } |
536 | 536 |
537 __asm__ __volatile__ (EMMS); /* EMMS macro is defined outside this view; presumably leaves MMX state - confirm */ | 537 __asm__ __volatile__ (EMMS); /* EMMS macro is defined outside this view; presumably leaves MMX state - confirm */ |
538 return srcSliceH; | 538 return srcSliceH; |
539 } | 539 } |