comparison postproc/yuv2rgb_template.c @ 13720:821f464b4d90

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
author aurel
date Thu, 21 Oct 2004 11:55:20 +0000
parents bc5b87370cd1
children
comparison
equal deleted inserted replaced
13719:43ecd6a73ec0 13720:821f464b4d90
141 for (y= 0; y<srcSliceH; y++ ) { 141 for (y= 0; y<srcSliceH; y++ ) {
142 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; 142 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
143 uint8_t *_py = src[0] + y*srcStride[0]; 143 uint8_t *_py = src[0] + y*srcStride[0];
144 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; 144 uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
145 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; 145 uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
146 int index= -h_size/2; 146 long index= -h_size/2;
147 147
148 b5Dither= dither8[y&1]; 148 b5Dither= dither8[y&1];
149 g6Dither= dither4[y&1]; 149 g6Dither= dither4[y&1];
150 g5Dither= dither8[y&1]; 150 g5Dither= dither8[y&1];
151 r5Dither= dither8[(y+1)&1]; 151 r5Dither= dither8[(y+1)&1];
202 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ 202 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
203 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ 203 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
204 204
205 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ 205 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
206 206
207 "addl $16, %1 \n\t" 207 "add $16, %1 \n\t"
208 "addl $4, %0 \n\t" 208 "add $4, %0 \n\t"
209 " js 1b \n\t" 209 " js 1b \n\t"
210 210
211 : "+r" (index), "+r" (_image) 211 : "+r" (index), "+r" (_image)
212 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) 212 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
213 ); 213 );
236 for (y= 0; y<srcSliceH; y++ ) { 236 for (y= 0; y<srcSliceH; y++ ) {
237 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; 237 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
238 uint8_t *_py = src[0] + y*srcStride[0]; 238 uint8_t *_py = src[0] + y*srcStride[0];
239 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; 239 uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
240 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; 240 uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
241 int index= -h_size/2; 241 long index= -h_size/2;
242 242
243 b5Dither= dither8[y&1]; 243 b5Dither= dither8[y&1];
244 g6Dither= dither4[y&1]; 244 g6Dither= dither4[y&1];
245 g5Dither= dither8[y&1]; 245 g5Dither= dither8[y&1];
246 r5Dither= dither8[(y+1)&1]; 246 r5Dither= dither8[(y+1)&1];
293 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ 293 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
294 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ 294 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
295 295
296 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ 296 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
297 297
298 "addl $16, %1 \n\t" 298 "add $16, %1 \n\t"
299 "addl $4, %0 \n\t" 299 "add $4, %0 \n\t"
300 " js 1b \n\t" 300 " js 1b \n\t"
301 : "+r" (index), "+r" (_image) 301 : "+r" (index), "+r" (_image)
302 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) 302 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
303 ); 303 );
304 } 304 }
324 for (y= 0; y<srcSliceH; y++ ) { 324 for (y= 0; y<srcSliceH; y++ ) {
325 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; 325 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
326 uint8_t *_py = src[0] + y*srcStride[0]; 326 uint8_t *_py = src[0] + y*srcStride[0];
327 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; 327 uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
328 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; 328 uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
329 int index= -h_size/2; 329 long index= -h_size/2;
330 330
331 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 331 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
332 pixels in each iteration */ 332 pixels in each iteration */
333 __asm__ __volatile__ ( 333 __asm__ __volatile__ (
334 /* load data for start of next scan line */ 334 /* load data for start of next scan line */
438 438
439 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ 439 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
440 "pxor %%mm4, %%mm4 \n\t" 440 "pxor %%mm4, %%mm4 \n\t"
441 #endif 441 #endif
442 442
443 "addl $24, %1 \n\t" 443 "add $24, %1 \n\t"
444 "addl $4, %0 \n\t" 444 "add $4, %0 \n\t"
445 " js 1b \n\t" 445 " js 1b \n\t"
446 446
447 : "+r" (index), "+r" (_image) 447 : "+r" (index), "+r" (_image)
448 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) 448 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
449 ); 449 );
470 for (y= 0; y<srcSliceH; y++ ) { 470 for (y= 0; y<srcSliceH; y++ ) {
471 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; 471 uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
472 uint8_t *_py = src[0] + y*srcStride[0]; 472 uint8_t *_py = src[0] + y*srcStride[0];
473 uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; 473 uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
474 uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; 474 uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
475 int index= -h_size/2; 475 long index= -h_size/2;
476 476
477 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 477 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
478 pixels in each iteration */ 478 pixels in each iteration */
479 __asm__ __volatile__ ( 479 __asm__ __volatile__ (
480 /* load data for start of next scan line */ 480 /* load data for start of next scan line */
524 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ 524 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
525 525
526 "pxor %%mm4, %%mm4;" /* zero mm4 */ 526 "pxor %%mm4, %%mm4;" /* zero mm4 */
527 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ 527 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
528 528
529 "addl $32, %1 \n\t" 529 "add $32, %1 \n\t"
530 "addl $4, %0 \n\t" 530 "add $4, %0 \n\t"
531 " js 1b \n\t" 531 " js 1b \n\t"
532 532
533 : "+r" (index), "+r" (_image) 533 : "+r" (index), "+r" (_image)
534 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) 534 : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
535 ); 535 );