comparison i386/dsputil_mmx.c @ 2293:15cfba1b97b5 libavcodec

Adapt existing MMX/MMX2/SSE/3DNow optimizations so they work on x86_64. Patch by Aurelien Jacobs <aurel at gnuage dot org>
author michael
date Mon, 11 Oct 2004 02:19:29 +0000
parents 7e0b2e86afa9
children 86e2b1424801
comparing 2292:8556f080fcc2 with 2293:15cfba1b97b5
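
The hunks below all apply one transformation: hard-coded 32-bit register names ("%%eax") and l-suffixed mnemonics ("movl", "addl") become width-neutral forms ("%%"REG_a"", "mov", "add"), and every int operand that feeds pointer arithmetic inside the asm is cast to long so GCC allocates a full-width register for it on x86_64. The REG_a/REG_b/... macros themselves are defined outside this hunk; the following is only a sketch of how such macros are typically laid out, assuming an ARCH_X86_64 guard in a shared header (guard name and location are assumptions, not shown in this comparison):

    /* sketch only: guard name and header location are assumptions */
    #ifdef ARCH_X86_64
    #  define REG_a "rax"
    #  define REG_b "rbx"
    #  define REG_c "rcx"
    #  define REG_d "rdx"
    #  define REG_S "rsi"
    #  define REG_D "rdi"
    #else
    #  define REG_a "eax"
    #  define REG_b "ebx"
    #  define REG_c "ecx"
    #  define REG_d "edx"
    #  define REG_S "esi"
    #  define REG_D "edi"
    #endif

With C string pasting, "mov $-128, %%"REG_a" \n\t" expands to mov $-128, %%eax on i386 and mov $-128, %%rax on x86_64; leaving the mnemonic unsuffixed lets the assembler infer the operand size from the register name.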
185 185
186 #ifdef CONFIG_ENCODERS 186 #ifdef CONFIG_ENCODERS
187 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) 187 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
188 { 188 {
189 asm volatile( 189 asm volatile(
190 "movl $-128, %%eax \n\t" 190 "mov $-128, %%"REG_a" \n\t"
191 "pxor %%mm7, %%mm7 \n\t" 191 "pxor %%mm7, %%mm7 \n\t"
192 ".balign 16 \n\t" 192 ".balign 16 \n\t"
193 "1: \n\t" 193 "1: \n\t"
194 "movq (%0), %%mm0 \n\t" 194 "movq (%0), %%mm0 \n\t"
195 "movq (%0, %2), %%mm2 \n\t" 195 "movq (%0, %2), %%mm2 \n\t"
197 "movq %%mm2, %%mm3 \n\t" 197 "movq %%mm2, %%mm3 \n\t"
198 "punpcklbw %%mm7, %%mm0 \n\t" 198 "punpcklbw %%mm7, %%mm0 \n\t"
199 "punpckhbw %%mm7, %%mm1 \n\t" 199 "punpckhbw %%mm7, %%mm1 \n\t"
200 "punpcklbw %%mm7, %%mm2 \n\t" 200 "punpcklbw %%mm7, %%mm2 \n\t"
201 "punpckhbw %%mm7, %%mm3 \n\t" 201 "punpckhbw %%mm7, %%mm3 \n\t"
202 "movq %%mm0, (%1, %%eax)\n\t" 202 "movq %%mm0, (%1, %%"REG_a")\n\t"
203 "movq %%mm1, 8(%1, %%eax)\n\t" 203 "movq %%mm1, 8(%1, %%"REG_a")\n\t"
204 "movq %%mm2, 16(%1, %%eax)\n\t" 204 "movq %%mm2, 16(%1, %%"REG_a")\n\t"
205 "movq %%mm3, 24(%1, %%eax)\n\t" 205 "movq %%mm3, 24(%1, %%"REG_a")\n\t"
206 "addl %3, %0 \n\t" 206 "add %3, %0 \n\t"
207 "addl $32, %%eax \n\t" 207 "add $32, %%"REG_a" \n\t"
208 "js 1b \n\t" 208 "js 1b \n\t"
209 : "+r" (pixels) 209 : "+r" (pixels)
210 : "r" (block+64), "r" (line_size), "r" (line_size*2) 210 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
211 : "%eax" 211 : "%"REG_a
212 ); 212 );
213 } 213 }
214 214
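Note the operand pattern in get_pixels_mmx above, repeated throughout the file: stride inputs become "r" ((long)line_size) and the clobber becomes "%"REG_a. The cast matters because the template now says add %3, %0 with %0 holding a pointer; if %3 stayed an int, GCC would substitute its 32-bit register name on x86_64 and the assembler would reject the mixed-width add. A reduced, hypothetical loop showing the same pattern (illustration only, not code from this file; it assumes the REG_a macro sketched above):

    static void advance_rows_sketch(uint8_t *p, int stride, int rows)
    {
        asm volatile(
            "mov %2, %%"REG_a"      \n\t" /* row counter in a full-width register */
            "1:                     \n\t"
            "add %1, %0             \n\t" /* p += stride: source and destination  */
                                          /* must both be full width on x86_64    */
            "dec %%"REG_a"          \n\t"
            "jnz 1b                 \n\t"
            : "+r"(p)
            : "r"((long)stride), "r"((long)rows)
            : "%"REG_a);
    }
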
215 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) 215 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
216 { 216 {
217 asm volatile( 217 asm volatile(
218 "pxor %%mm7, %%mm7 \n\t" 218 "pxor %%mm7, %%mm7 \n\t"
219 "movl $-128, %%eax \n\t" 219 "mov $-128, %%"REG_a" \n\t"
220 ".balign 16 \n\t" 220 ".balign 16 \n\t"
221 "1: \n\t" 221 "1: \n\t"
222 "movq (%0), %%mm0 \n\t" 222 "movq (%0), %%mm0 \n\t"
223 "movq (%1), %%mm2 \n\t" 223 "movq (%1), %%mm2 \n\t"
224 "movq %%mm0, %%mm1 \n\t" 224 "movq %%mm0, %%mm1 \n\t"
227 "punpckhbw %%mm7, %%mm1 \n\t" 227 "punpckhbw %%mm7, %%mm1 \n\t"
228 "punpcklbw %%mm7, %%mm2 \n\t" 228 "punpcklbw %%mm7, %%mm2 \n\t"
229 "punpckhbw %%mm7, %%mm3 \n\t" 229 "punpckhbw %%mm7, %%mm3 \n\t"
230 "psubw %%mm2, %%mm0 \n\t" 230 "psubw %%mm2, %%mm0 \n\t"
231 "psubw %%mm3, %%mm1 \n\t" 231 "psubw %%mm3, %%mm1 \n\t"
232 "movq %%mm0, (%2, %%eax)\n\t" 232 "movq %%mm0, (%2, %%"REG_a")\n\t"
233 "movq %%mm1, 8(%2, %%eax)\n\t" 233 "movq %%mm1, 8(%2, %%"REG_a")\n\t"
234 "addl %3, %0 \n\t" 234 "add %3, %0 \n\t"
235 "addl %3, %1 \n\t" 235 "add %3, %1 \n\t"
236 "addl $16, %%eax \n\t" 236 "add $16, %%"REG_a" \n\t"
237 "jnz 1b \n\t" 237 "jnz 1b \n\t"
238 : "+r" (s1), "+r" (s2) 238 : "+r" (s1), "+r" (s2)
239 : "r" (block+64), "r" (stride) 239 : "r" (block+64), "r" ((long)stride)
240 : "%eax" 240 : "%"REG_a
241 ); 241 );
242 } 242 }
243 #endif //CONFIG_ENCODERS 243 #endif //CONFIG_ENCODERS
244 244
245 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 245 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
266 "packuswb %%mm7, %%mm6\n\t" 266 "packuswb %%mm7, %%mm6\n\t"
267 "movq %%mm0, (%0)\n\t" 267 "movq %%mm0, (%0)\n\t"
268 "movq %%mm2, (%0, %1)\n\t" 268 "movq %%mm2, (%0, %1)\n\t"
269 "movq %%mm4, (%0, %1, 2)\n\t" 269 "movq %%mm4, (%0, %1, 2)\n\t"
270 "movq %%mm6, (%0, %2)\n\t" 270 "movq %%mm6, (%0, %2)\n\t"
271 ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) 271 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
272 :"memory"); 272 :"memory");
273 pix += line_size*4; 273 pix += line_size*4;
274 p += 32; 274 p += 32;
275 275
276 // if here would be an exact copy of the code above 276 // if here would be an exact copy of the code above
291 "packuswb %%mm7, %%mm6\n\t" 291 "packuswb %%mm7, %%mm6\n\t"
292 "movq %%mm0, (%0)\n\t" 292 "movq %%mm0, (%0)\n\t"
293 "movq %%mm2, (%0, %1)\n\t" 293 "movq %%mm2, (%0, %1)\n\t"
294 "movq %%mm4, (%0, %1, 2)\n\t" 294 "movq %%mm4, (%0, %1, 2)\n\t"
295 "movq %%mm6, (%0, %2)\n\t" 295 "movq %%mm6, (%0, %2)\n\t"
296 ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) 296 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
297 :"memory"); 297 :"memory");
298 } 298 }
299 299
300 static unsigned char __align8 vector128[8] = 300 static unsigned char __align8 vector128[8] =
301 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; 301 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
357 } 357 }
358 358
359 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 359 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
360 { 360 {
361 __asm __volatile( 361 __asm __volatile(
362 "lea (%3, %3), %%eax \n\t" 362 "lea (%3, %3), %%"REG_a" \n\t"
363 ".balign 8 \n\t" 363 ".balign 8 \n\t"
364 "1: \n\t" 364 "1: \n\t"
365 "movd (%1), %%mm0 \n\t" 365 "movd (%1), %%mm0 \n\t"
366 "movd (%1, %3), %%mm1 \n\t" 366 "movd (%1, %3), %%mm1 \n\t"
367 "movd %%mm0, (%2) \n\t" 367 "movd %%mm0, (%2) \n\t"
368 "movd %%mm1, (%2, %3) \n\t" 368 "movd %%mm1, (%2, %3) \n\t"
369 "addl %%eax, %1 \n\t" 369 "add %%"REG_a", %1 \n\t"
370 "addl %%eax, %2 \n\t" 370 "add %%"REG_a", %2 \n\t"
371 "movd (%1), %%mm0 \n\t" 371 "movd (%1), %%mm0 \n\t"
372 "movd (%1, %3), %%mm1 \n\t" 372 "movd (%1, %3), %%mm1 \n\t"
373 "movd %%mm0, (%2) \n\t" 373 "movd %%mm0, (%2) \n\t"
374 "movd %%mm1, (%2, %3) \n\t" 374 "movd %%mm1, (%2, %3) \n\t"
375 "addl %%eax, %1 \n\t" 375 "add %%"REG_a", %1 \n\t"
376 "addl %%eax, %2 \n\t" 376 "add %%"REG_a", %2 \n\t"
377 "subl $4, %0 \n\t" 377 "subl $4, %0 \n\t"
378 "jnz 1b \n\t" 378 "jnz 1b \n\t"
379 : "+g"(h), "+r" (pixels), "+r" (block) 379 : "+g"(h), "+r" (pixels), "+r" (block)
380 : "r"(line_size) 380 : "r"((long)line_size)
381 : "%eax", "memory" 381 : "%"REG_a, "memory"
382 ); 382 );
383 } 383 }
384 384
385 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 385 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
386 { 386 {
387 __asm __volatile( 387 __asm __volatile(
388 "lea (%3, %3), %%eax \n\t" 388 "lea (%3, %3), %%"REG_a" \n\t"
389 ".balign 8 \n\t" 389 ".balign 8 \n\t"
390 "1: \n\t" 390 "1: \n\t"
391 "movq (%1), %%mm0 \n\t" 391 "movq (%1), %%mm0 \n\t"
392 "movq (%1, %3), %%mm1 \n\t" 392 "movq (%1, %3), %%mm1 \n\t"
393 "movq %%mm0, (%2) \n\t" 393 "movq %%mm0, (%2) \n\t"
394 "movq %%mm1, (%2, %3) \n\t" 394 "movq %%mm1, (%2, %3) \n\t"
395 "addl %%eax, %1 \n\t" 395 "add %%"REG_a", %1 \n\t"
396 "addl %%eax, %2 \n\t" 396 "add %%"REG_a", %2 \n\t"
397 "movq (%1), %%mm0 \n\t" 397 "movq (%1), %%mm0 \n\t"
398 "movq (%1, %3), %%mm1 \n\t" 398 "movq (%1, %3), %%mm1 \n\t"
399 "movq %%mm0, (%2) \n\t" 399 "movq %%mm0, (%2) \n\t"
400 "movq %%mm1, (%2, %3) \n\t" 400 "movq %%mm1, (%2, %3) \n\t"
401 "addl %%eax, %1 \n\t" 401 "add %%"REG_a", %1 \n\t"
402 "addl %%eax, %2 \n\t" 402 "add %%"REG_a", %2 \n\t"
403 "subl $4, %0 \n\t" 403 "subl $4, %0 \n\t"
404 "jnz 1b \n\t" 404 "jnz 1b \n\t"
405 : "+g"(h), "+r" (pixels), "+r" (block) 405 : "+g"(h), "+r" (pixels), "+r" (block)
406 : "r"(line_size) 406 : "r"((long)line_size)
407 : "%eax", "memory" 407 : "%"REG_a, "memory"
408 ); 408 );
409 } 409 }
410 410
411 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 411 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
412 { 412 {
413 __asm __volatile( 413 __asm __volatile(
414 "lea (%3, %3), %%eax \n\t" 414 "lea (%3, %3), %%"REG_a" \n\t"
415 ".balign 8 \n\t" 415 ".balign 8 \n\t"
416 "1: \n\t" 416 "1: \n\t"
417 "movq (%1), %%mm0 \n\t" 417 "movq (%1), %%mm0 \n\t"
418 "movq 8(%1), %%mm4 \n\t" 418 "movq 8(%1), %%mm4 \n\t"
419 "movq (%1, %3), %%mm1 \n\t" 419 "movq (%1, %3), %%mm1 \n\t"
420 "movq 8(%1, %3), %%mm5 \n\t" 420 "movq 8(%1, %3), %%mm5 \n\t"
421 "movq %%mm0, (%2) \n\t" 421 "movq %%mm0, (%2) \n\t"
422 "movq %%mm4, 8(%2) \n\t" 422 "movq %%mm4, 8(%2) \n\t"
423 "movq %%mm1, (%2, %3) \n\t" 423 "movq %%mm1, (%2, %3) \n\t"
424 "movq %%mm5, 8(%2, %3) \n\t" 424 "movq %%mm5, 8(%2, %3) \n\t"
425 "addl %%eax, %1 \n\t" 425 "add %%"REG_a", %1 \n\t"
426 "addl %%eax, %2 \n\t" 426 "add %%"REG_a", %2 \n\t"
427 "movq (%1), %%mm0 \n\t" 427 "movq (%1), %%mm0 \n\t"
428 "movq 8(%1), %%mm4 \n\t" 428 "movq 8(%1), %%mm4 \n\t"
429 "movq (%1, %3), %%mm1 \n\t" 429 "movq (%1, %3), %%mm1 \n\t"
430 "movq 8(%1, %3), %%mm5 \n\t" 430 "movq 8(%1, %3), %%mm5 \n\t"
431 "movq %%mm0, (%2) \n\t" 431 "movq %%mm0, (%2) \n\t"
432 "movq %%mm4, 8(%2) \n\t" 432 "movq %%mm4, 8(%2) \n\t"
433 "movq %%mm1, (%2, %3) \n\t" 433 "movq %%mm1, (%2, %3) \n\t"
434 "movq %%mm5, 8(%2, %3) \n\t" 434 "movq %%mm5, 8(%2, %3) \n\t"
435 "addl %%eax, %1 \n\t" 435 "add %%"REG_a", %1 \n\t"
436 "addl %%eax, %2 \n\t" 436 "add %%"REG_a", %2 \n\t"
437 "subl $4, %0 \n\t" 437 "subl $4, %0 \n\t"
438 "jnz 1b \n\t" 438 "jnz 1b \n\t"
439 : "+g"(h), "+r" (pixels), "+r" (block) 439 : "+g"(h), "+r" (pixels), "+r" (block)
440 : "r"(line_size) 440 : "r"((long)line_size)
441 : "%eax", "memory" 441 : "%"REG_a, "memory"
442 ); 442 );
443 } 443 }
444 444
445 static void clear_blocks_mmx(DCTELEM *blocks) 445 static void clear_blocks_mmx(DCTELEM *blocks)
446 { 446 {
447 __asm __volatile( 447 __asm __volatile(
448 "pxor %%mm7, %%mm7 \n\t" 448 "pxor %%mm7, %%mm7 \n\t"
449 "movl $-128*6, %%eax \n\t" 449 "mov $-128*6, %%"REG_a" \n\t"
450 "1: \n\t" 450 "1: \n\t"
451 "movq %%mm7, (%0, %%eax) \n\t" 451 "movq %%mm7, (%0, %%"REG_a") \n\t"
452 "movq %%mm7, 8(%0, %%eax) \n\t" 452 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
453 "movq %%mm7, 16(%0, %%eax) \n\t" 453 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
454 "movq %%mm7, 24(%0, %%eax) \n\t" 454 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
455 "addl $32, %%eax \n\t" 455 "add $32, %%"REG_a" \n\t"
456 " js 1b \n\t" 456 " js 1b \n\t"
457 : : "r" (((int)blocks)+128*6) 457 : : "r" (((uint8_t *)blocks)+128*6)
458 : "%eax" 458 : "%"REG_a
459 ); 459 );
460 } 460 }
461 461
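clear_blocks_mmx above contains the one change in this hunk that is not a register rename: the end-of-buffer operand goes from ((int)blocks)+128*6 to ((uint8_t *)blocks)+128*6. Casting a pointer to int is harmless on i386 but truncates the address to 32 bits on x86_64; byte-pointer arithmetic yields the same offset without losing the upper half, and the negative index in REG_a then walks back up through the 6*128 bytes. A hypothetical illustration of the difference (not code from the file; DCTELEM taken as int16_t, the usual configuration):

    #include <stdint.h>
    typedef int16_t DCTELEM;              /* common libavcodec typedef (assumed) */

    static uint8_t *end_of_blocks(DCTELEM *blocks)
    {
        /* old form: (int)blocks chops a 64-bit address down to 32 bits      */
        /* new form: byte-pointer arithmetic keeps the full address on LP64  */
        return (uint8_t *)blocks + 6*64*sizeof(DCTELEM);   /* == 128*6 bytes */
    }
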
462 #ifdef CONFIG_ENCODERS 462 #ifdef CONFIG_ENCODERS
463 static int pix_sum16_mmx(uint8_t * pix, int line_size){ 463 static int pix_sum16_mmx(uint8_t * pix, int line_size){
464 const int h=16; 464 const int h=16;
465 int sum; 465 int sum;
466 int index= -line_size*h; 466 long index= -line_size*h;
467 467
468 __asm __volatile( 468 __asm __volatile(
469 "pxor %%mm7, %%mm7 \n\t" 469 "pxor %%mm7, %%mm7 \n\t"
470 "pxor %%mm6, %%mm6 \n\t" 470 "pxor %%mm6, %%mm6 \n\t"
471 "1: \n\t" 471 "1: \n\t"
479 "punpckhbw %%mm7, %%mm3 \n\t" 479 "punpckhbw %%mm7, %%mm3 \n\t"
480 "paddw %%mm0, %%mm1 \n\t" 480 "paddw %%mm0, %%mm1 \n\t"
481 "paddw %%mm2, %%mm3 \n\t" 481 "paddw %%mm2, %%mm3 \n\t"
482 "paddw %%mm1, %%mm3 \n\t" 482 "paddw %%mm1, %%mm3 \n\t"
483 "paddw %%mm3, %%mm6 \n\t" 483 "paddw %%mm3, %%mm6 \n\t"
484 "addl %3, %1 \n\t" 484 "add %3, %1 \n\t"
485 " js 1b \n\t" 485 " js 1b \n\t"
486 "movq %%mm6, %%mm5 \n\t" 486 "movq %%mm6, %%mm5 \n\t"
487 "psrlq $32, %%mm6 \n\t" 487 "psrlq $32, %%mm6 \n\t"
488 "paddw %%mm5, %%mm6 \n\t" 488 "paddw %%mm5, %%mm6 \n\t"
489 "movq %%mm6, %%mm5 \n\t" 489 "movq %%mm6, %%mm5 \n\t"
490 "psrlq $16, %%mm6 \n\t" 490 "psrlq $16, %%mm6 \n\t"
491 "paddw %%mm5, %%mm6 \n\t" 491 "paddw %%mm5, %%mm6 \n\t"
492 "movd %%mm6, %0 \n\t" 492 "movd %%mm6, %0 \n\t"
493 "andl $0xFFFF, %0 \n\t" 493 "andl $0xFFFF, %0 \n\t"
494 : "=&r" (sum), "+r" (index) 494 : "=&r" (sum), "+r" (index)
495 : "r" (pix - index), "r" (line_size) 495 : "r" (pix - index), "r" ((long)line_size)
496 ); 496 );
497 497
498 return sum; 498 return sum;
499 } 499 }
500 #endif //CONFIG_ENCODERS 500 #endif //CONFIG_ENCODERS
501 501
502 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ 502 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
503 int i=0; 503 long i=0;
504 asm volatile( 504 asm volatile(
505 "1: \n\t" 505 "1: \n\t"
506 "movq (%1, %0), %%mm0 \n\t" 506 "movq (%1, %0), %%mm0 \n\t"
507 "movq (%2, %0), %%mm1 \n\t" 507 "movq (%2, %0), %%mm1 \n\t"
508 "paddb %%mm0, %%mm1 \n\t" 508 "paddb %%mm0, %%mm1 \n\t"
509 "movq %%mm1, (%2, %0) \n\t" 509 "movq %%mm1, (%2, %0) \n\t"
510 "movq 8(%1, %0), %%mm0 \n\t" 510 "movq 8(%1, %0), %%mm0 \n\t"
511 "movq 8(%2, %0), %%mm1 \n\t" 511 "movq 8(%2, %0), %%mm1 \n\t"
512 "paddb %%mm0, %%mm1 \n\t" 512 "paddb %%mm0, %%mm1 \n\t"
513 "movq %%mm1, 8(%2, %0) \n\t" 513 "movq %%mm1, 8(%2, %0) \n\t"
514 "addl $16, %0 \n\t" 514 "add $16, %0 \n\t"
515 "cmpl %3, %0 \n\t" 515 "cmp %3, %0 \n\t"
516 " jb 1b \n\t" 516 " jb 1b \n\t"
517 : "+r" (i) 517 : "+r" (i)
518 : "r"(src), "r"(dst), "r"(w-15) 518 : "r"(src), "r"(dst), "r"((long)w-15)
519 ); 519 );
520 for(; i<w; i++) 520 for(; i<w; i++)
521 dst[i+0] += src[i+0]; 521 dst[i+0] += src[i+0];
522 } 522 }
523 523
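add_bytes_mmx above (and diff_bytes_mmx, sub_hfyu_median_prediction_mmx2, try_8x8basis_mmx and add_8x8basis_mmx further down) also changes its C loop index from int to long: the index is used directly as the index register in (%1, %0) / (%2, %0) addressing, so on x86_64 it must live in a 64-bit register, and the loop bound is passed as (long)w-15 for the same reason. A reduced, hypothetical version of the idiom, processing 8 bytes per iteration instead of 16 (illustration only; the caller is expected to emms afterwards, as with the real dsputil routines):

    static void add_bytes_sketch(uint8_t *dst, const uint8_t *src, int w)
    {
        long i = 0;                        /* long: used as an index register */
        if (w >= 8)
            asm volatile(
                "1:                    \n\t"
                "movq (%1, %0), %%mm0  \n\t" /* 8 bytes of src[i]             */
                "paddb (%2, %0), %%mm0 \n\t" /* += dst[i]                     */
                "movq %%mm0, (%2, %0)  \n\t"
                "add $8, %0            \n\t"
                "cmp %3, %0            \n\t"
                " jb 1b                \n\t"
                : "+r"(i)
                : "r"(src), "r"(dst), "r"((long)w - 7)
                : "memory");
        for (; i < w; i++)                 /* scalar tail for trailing bytes  */
            dst[i] += src[i];
    }
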
724 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 724 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
725 pix2^2+pix3^2+pix6^2+pix7^2) */ 725 pix2^2+pix3^2+pix6^2+pix7^2) */
726 "paddd %%mm3,%%mm4\n" 726 "paddd %%mm3,%%mm4\n"
727 "paddd %%mm2,%%mm7\n" 727 "paddd %%mm2,%%mm7\n"
728 728
729 "addl %2, %0\n" 729 "add %2, %0\n"
730 "paddd %%mm4,%%mm7\n" 730 "paddd %%mm4,%%mm7\n"
731 "dec %%ecx\n" 731 "dec %%ecx\n"
732 "jnz 1b\n" 732 "jnz 1b\n"
733 733
734 "movq %%mm7,%%mm1\n" 734 "movq %%mm7,%%mm1\n"
735 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 735 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
736 "paddd %%mm7,%%mm1\n" 736 "paddd %%mm7,%%mm1\n"
737 "movd %%mm1,%1\n" 737 "movd %%mm1,%1\n"
738 : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" ); 738 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
739 return tmp; 739 return tmp;
740 } 740 }
741 741
742 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 742 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
743 int tmp; 743 int tmp;
761 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 761 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
762 762
763 "pmaddwd %%mm2,%%mm2\n" 763 "pmaddwd %%mm2,%%mm2\n"
764 "pmaddwd %%mm1,%%mm1\n" 764 "pmaddwd %%mm1,%%mm1\n"
765 765
766 "addl %3,%0\n" 766 "add %3,%0\n"
767 "addl %3,%1\n" 767 "add %3,%1\n"
768 768
769 "paddd %%mm2,%%mm1\n" 769 "paddd %%mm2,%%mm1\n"
770 "paddd %%mm1,%%mm7\n" 770 "paddd %%mm1,%%mm7\n"
771 771
772 "decl %%ecx\n" 772 "decl %%ecx\n"
775 "movq %%mm7,%%mm1\n" 775 "movq %%mm7,%%mm1\n"
776 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 776 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
777 "paddd %%mm7,%%mm1\n" 777 "paddd %%mm7,%%mm1\n"
778 "movd %%mm1,%2\n" 778 "movd %%mm1,%2\n"
779 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 779 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
780 : "r" (line_size) , "m" (h) 780 : "r" ((long)line_size) , "m" (h)
781 : "%ecx"); 781 : "%ecx");
782 return tmp; 782 return tmp;
783 } 783 }
784 784
785 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 785 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
819 "pmaddwd %%mm2,%%mm2\n" 819 "pmaddwd %%mm2,%%mm2\n"
820 "pmaddwd %%mm4,%%mm4\n" 820 "pmaddwd %%mm4,%%mm4\n"
821 "pmaddwd %%mm1,%%mm1\n" 821 "pmaddwd %%mm1,%%mm1\n"
822 "pmaddwd %%mm3,%%mm3\n" 822 "pmaddwd %%mm3,%%mm3\n"
823 823
824 "addl %3,%0\n" 824 "add %3,%0\n"
825 "addl %3,%1\n" 825 "add %3,%1\n"
826 826
827 "paddd %%mm2,%%mm1\n" 827 "paddd %%mm2,%%mm1\n"
828 "paddd %%mm4,%%mm3\n" 828 "paddd %%mm4,%%mm3\n"
829 "paddd %%mm1,%%mm7\n" 829 "paddd %%mm1,%%mm7\n"
830 "paddd %%mm3,%%mm7\n" 830 "paddd %%mm3,%%mm7\n"
835 "movq %%mm7,%%mm1\n" 835 "movq %%mm7,%%mm1\n"
836 "psrlq $32, %%mm7\n" /* shift hi dword to lo */ 836 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
837 "paddd %%mm7,%%mm1\n" 837 "paddd %%mm7,%%mm1\n"
838 "movd %%mm1,%2\n" 838 "movd %%mm1,%2\n"
839 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 839 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
840 : "r" (line_size) , "m" (h) 840 : "r" ((long)line_size) , "m" (h)
841 : "%ecx"); 841 : "%ecx");
842 return tmp; 842 return tmp;
843 } 843 }
844 844
845 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 845 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
861 "punpckhbw %%mm7,%%mm2\n" 861 "punpckhbw %%mm7,%%mm2\n"
862 "punpckhbw %%mm7,%%mm3\n" 862 "punpckhbw %%mm7,%%mm3\n"
863 "psubw %%mm1, %%mm0\n" 863 "psubw %%mm1, %%mm0\n"
864 "psubw %%mm3, %%mm2\n" 864 "psubw %%mm3, %%mm2\n"
865 865
866 "addl %2,%0\n" 866 "add %2,%0\n"
867 867
868 "movq (%0),%%mm4\n" 868 "movq (%0),%%mm4\n"
869 "movq %%mm4, %%mm1\n" 869 "movq %%mm4, %%mm1\n"
870 "psllq $8, %%mm4\n" 870 "psllq $8, %%mm4\n"
871 "psrlq $8, %%mm1\n" 871 "psrlq $8, %%mm1\n"
889 "psubw %%mm3, %%mm0\n" 889 "psubw %%mm3, %%mm0\n"
890 "psubw %%mm1, %%mm2\n" 890 "psubw %%mm1, %%mm2\n"
891 "paddw %%mm0, %%mm2\n" 891 "paddw %%mm0, %%mm2\n"
892 "paddw %%mm2, %%mm6\n" 892 "paddw %%mm2, %%mm6\n"
893 893
894 "addl %2,%0\n" 894 "add %2,%0\n"
895 "1:\n" 895 "1:\n"
896 896
897 "movq (%0),%%mm0\n" 897 "movq (%0),%%mm0\n"
898 "movq %%mm0, %%mm1\n" 898 "movq %%mm0, %%mm1\n"
899 "psllq $8, %%mm0\n" 899 "psllq $8, %%mm0\n"
918 "psubw %%mm3, %%mm4\n" 918 "psubw %%mm3, %%mm4\n"
919 "psubw %%mm1, %%mm5\n" 919 "psubw %%mm1, %%mm5\n"
920 "paddw %%mm4, %%mm5\n" 920 "paddw %%mm4, %%mm5\n"
921 "paddw %%mm5, %%mm6\n" 921 "paddw %%mm5, %%mm6\n"
922 922
923 "addl %2,%0\n" 923 "add %2,%0\n"
924 924
925 "movq (%0),%%mm4\n" 925 "movq (%0),%%mm4\n"
926 "movq %%mm4, %%mm1\n" 926 "movq %%mm4, %%mm1\n"
927 "psllq $8, %%mm4\n" 927 "psllq $8, %%mm4\n"
928 "psrlq $8, %%mm1\n" 928 "psrlq $8, %%mm1\n"
946 "psubw %%mm3, %%mm0\n" 946 "psubw %%mm3, %%mm0\n"
947 "psubw %%mm1, %%mm2\n" 947 "psubw %%mm1, %%mm2\n"
948 "paddw %%mm0, %%mm2\n" 948 "paddw %%mm0, %%mm2\n"
949 "paddw %%mm2, %%mm6\n" 949 "paddw %%mm2, %%mm6\n"
950 950
951 "addl %2,%0\n" 951 "add %2,%0\n"
952 "subl $2, %%ecx\n" 952 "subl $2, %%ecx\n"
953 " jnz 1b\n" 953 " jnz 1b\n"
954 954
955 "movq %%mm6, %%mm0\n" 955 "movq %%mm6, %%mm0\n"
956 "punpcklwd %%mm7,%%mm0\n" 956 "punpcklwd %%mm7,%%mm0\n"
960 "movq %%mm6,%%mm0\n" 960 "movq %%mm6,%%mm0\n"
961 "psrlq $32, %%mm6\n" 961 "psrlq $32, %%mm6\n"
962 "paddd %%mm6,%%mm0\n" 962 "paddd %%mm6,%%mm0\n"
963 "movd %%mm0,%1\n" 963 "movd %%mm0,%1\n"
964 : "+r" (pix1), "=r"(tmp) 964 : "+r" (pix1), "=r"(tmp)
965 : "r" (line_size) , "g" (h-2) 965 : "r" ((long)line_size) , "g" (h-2)
966 : "%ecx"); 966 : "%ecx");
967 return tmp; 967 return tmp;
968 } 968 }
969 969
970 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 970 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
984 "punpckhbw %%mm7,%%mm2\n" 984 "punpckhbw %%mm7,%%mm2\n"
985 "punpckhbw %%mm7,%%mm3\n" 985 "punpckhbw %%mm7,%%mm3\n"
986 "psubw %%mm1, %%mm0\n" 986 "psubw %%mm1, %%mm0\n"
987 "psubw %%mm3, %%mm2\n" 987 "psubw %%mm3, %%mm2\n"
988 988
989 "addl %2,%0\n" 989 "add %2,%0\n"
990 990
991 "movq (%0),%%mm4\n" 991 "movq (%0),%%mm4\n"
992 "movq 1(%0),%%mm1\n" 992 "movq 1(%0),%%mm1\n"
993 "movq %%mm4, %%mm5\n" 993 "movq %%mm4, %%mm5\n"
994 "movq %%mm1, %%mm3\n" 994 "movq %%mm1, %%mm3\n"
1009 "psubw %%mm3, %%mm0\n" 1009 "psubw %%mm3, %%mm0\n"
1010 "psubw %%mm1, %%mm2\n" 1010 "psubw %%mm1, %%mm2\n"
1011 "paddw %%mm0, %%mm2\n" 1011 "paddw %%mm0, %%mm2\n"
1012 "paddw %%mm2, %%mm6\n" 1012 "paddw %%mm2, %%mm6\n"
1013 1013
1014 "addl %2,%0\n" 1014 "add %2,%0\n"
1015 "1:\n" 1015 "1:\n"
1016 1016
1017 "movq (%0),%%mm0\n" 1017 "movq (%0),%%mm0\n"
1018 "movq 1(%0),%%mm1\n" 1018 "movq 1(%0),%%mm1\n"
1019 "movq %%mm0, %%mm2\n" 1019 "movq %%mm0, %%mm2\n"
1035 "psubw %%mm3, %%mm4\n" 1035 "psubw %%mm3, %%mm4\n"
1036 "psubw %%mm1, %%mm5\n" 1036 "psubw %%mm1, %%mm5\n"
1037 "paddw %%mm4, %%mm5\n" 1037 "paddw %%mm4, %%mm5\n"
1038 "paddw %%mm5, %%mm6\n" 1038 "paddw %%mm5, %%mm6\n"
1039 1039
1040 "addl %2,%0\n" 1040 "add %2,%0\n"
1041 1041
1042 "movq (%0),%%mm4\n" 1042 "movq (%0),%%mm4\n"
1043 "movq 1(%0),%%mm1\n" 1043 "movq 1(%0),%%mm1\n"
1044 "movq %%mm4, %%mm5\n" 1044 "movq %%mm4, %%mm5\n"
1045 "movq %%mm1, %%mm3\n" 1045 "movq %%mm1, %%mm3\n"
1060 "psubw %%mm3, %%mm0\n" 1060 "psubw %%mm3, %%mm0\n"
1061 "psubw %%mm1, %%mm2\n" 1061 "psubw %%mm1, %%mm2\n"
1062 "paddw %%mm0, %%mm2\n" 1062 "paddw %%mm0, %%mm2\n"
1063 "paddw %%mm2, %%mm6\n" 1063 "paddw %%mm2, %%mm6\n"
1064 1064
1065 "addl %2,%0\n" 1065 "add %2,%0\n"
1066 "subl $2, %%ecx\n" 1066 "subl $2, %%ecx\n"
1067 " jnz 1b\n" 1067 " jnz 1b\n"
1068 1068
1069 "movq %%mm6, %%mm0\n" 1069 "movq %%mm6, %%mm0\n"
1070 "punpcklwd %%mm7,%%mm0\n" 1070 "punpcklwd %%mm7,%%mm0\n"
1074 "movq %%mm6,%%mm0\n" 1074 "movq %%mm6,%%mm0\n"
1075 "psrlq $32, %%mm6\n" 1075 "psrlq $32, %%mm6\n"
1076 "paddd %%mm6,%%mm0\n" 1076 "paddd %%mm6,%%mm0\n"
1077 "movd %%mm0,%1\n" 1077 "movd %%mm0,%1\n"
1078 : "+r" (pix1), "=r"(tmp) 1078 : "+r" (pix1), "=r"(tmp)
1079 : "r" (line_size) , "g" (h-2) 1079 : "r" ((long)line_size) , "g" (h-2)
1080 : "%ecx"); 1080 : "%ecx");
1081 return tmp + hf_noise8_mmx(pix+8, line_size, h); 1081 return tmp + hf_noise8_mmx(pix+8, line_size, h);
1082 } 1082 }
1083 1083
1084 static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 1084 static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
1104 assert((line_size &7) ==0); 1104 assert((line_size &7) ==0);
1105 1105
1106 #define SUM(in0, in1, out0, out1) \ 1106 #define SUM(in0, in1, out0, out1) \
1107 "movq (%0), %%mm2\n"\ 1107 "movq (%0), %%mm2\n"\
1108 "movq 8(%0), %%mm3\n"\ 1108 "movq 8(%0), %%mm3\n"\
1109 "addl %2,%0\n"\ 1109 "add %2,%0\n"\
1110 "movq %%mm2, " #out0 "\n"\ 1110 "movq %%mm2, " #out0 "\n"\
1111 "movq %%mm3, " #out1 "\n"\ 1111 "movq %%mm3, " #out1 "\n"\
1112 "psubusb " #in0 ", %%mm2\n"\ 1112 "psubusb " #in0 ", %%mm2\n"\
1113 "psubusb " #in1 ", %%mm3\n"\ 1113 "psubusb " #in1 ", %%mm3\n"\
1114 "psubusb " #out0 ", " #in0 "\n"\ 1114 "psubusb " #out0 ", " #in0 "\n"\
1131 "movl %3,%%ecx\n" 1131 "movl %3,%%ecx\n"
1132 "pxor %%mm6,%%mm6\n" 1132 "pxor %%mm6,%%mm6\n"
1133 "pxor %%mm7,%%mm7\n" 1133 "pxor %%mm7,%%mm7\n"
1134 "movq (%0),%%mm0\n" 1134 "movq (%0),%%mm0\n"
1135 "movq 8(%0),%%mm1\n" 1135 "movq 8(%0),%%mm1\n"
1136 "addl %2,%0\n" 1136 "add %2,%0\n"
1137 "subl $2, %%ecx\n" 1137 "subl $2, %%ecx\n"
1138 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 1138 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1139 "1:\n" 1139 "1:\n"
1140 1140
1141 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 1141 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1151 "movq %%mm0,%%mm6\n" 1151 "movq %%mm0,%%mm6\n"
1152 "psrlq $16, %%mm0\n" 1152 "psrlq $16, %%mm0\n"
1153 "paddw %%mm6,%%mm0\n" 1153 "paddw %%mm6,%%mm0\n"
1154 "movd %%mm0,%1\n" 1154 "movd %%mm0,%1\n"
1155 : "+r" (pix), "=r"(tmp) 1155 : "+r" (pix), "=r"(tmp)
1156 : "r" (line_size) , "m" (h) 1156 : "r" ((long)line_size) , "m" (h)
1157 : "%ecx"); 1157 : "%ecx");
1158 return tmp & 0xFFFF; 1158 return tmp & 0xFFFF;
1159 } 1159 }
1160 #undef SUM 1160 #undef SUM
1161 1161
1166 assert((line_size &7) ==0); 1166 assert((line_size &7) ==0);
1167 1167
1168 #define SUM(in0, in1, out0, out1) \ 1168 #define SUM(in0, in1, out0, out1) \
1169 "movq (%0), " #out0 "\n"\ 1169 "movq (%0), " #out0 "\n"\
1170 "movq 8(%0), " #out1 "\n"\ 1170 "movq 8(%0), " #out1 "\n"\
1171 "addl %2,%0\n"\ 1171 "add %2,%0\n"\
1172 "psadbw " #out0 ", " #in0 "\n"\ 1172 "psadbw " #out0 ", " #in0 "\n"\
1173 "psadbw " #out1 ", " #in1 "\n"\ 1173 "psadbw " #out1 ", " #in1 "\n"\
1174 "paddw " #in1 ", " #in0 "\n"\ 1174 "paddw " #in1 ", " #in0 "\n"\
1175 "paddw " #in0 ", %%mm6\n" 1175 "paddw " #in0 ", %%mm6\n"
1176 1176
1178 "movl %3,%%ecx\n" 1178 "movl %3,%%ecx\n"
1179 "pxor %%mm6,%%mm6\n" 1179 "pxor %%mm6,%%mm6\n"
1180 "pxor %%mm7,%%mm7\n" 1180 "pxor %%mm7,%%mm7\n"
1181 "movq (%0),%%mm0\n" 1181 "movq (%0),%%mm0\n"
1182 "movq 8(%0),%%mm1\n" 1182 "movq 8(%0),%%mm1\n"
1183 "addl %2,%0\n" 1183 "add %2,%0\n"
1184 "subl $2, %%ecx\n" 1184 "subl $2, %%ecx\n"
1185 SUM(%%mm0, %%mm1, %%mm4, %%mm5) 1185 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
1186 "1:\n" 1186 "1:\n"
1187 1187
1188 SUM(%%mm4, %%mm5, %%mm0, %%mm1) 1188 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
1192 "subl $2, %%ecx\n" 1192 "subl $2, %%ecx\n"
1193 "jnz 1b\n" 1193 "jnz 1b\n"
1194 1194
1195 "movd %%mm6,%1\n" 1195 "movd %%mm6,%1\n"
1196 : "+r" (pix), "=r"(tmp) 1196 : "+r" (pix), "=r"(tmp)
1197 : "r" (line_size) , "m" (h) 1197 : "r" ((long)line_size) , "m" (h)
1198 : "%ecx"); 1198 : "%ecx");
1199 return tmp; 1199 return tmp;
1200 } 1200 }
1201 #undef SUM 1201 #undef SUM
1202 1202
1210 #define SUM(in0, in1, out0, out1) \ 1210 #define SUM(in0, in1, out0, out1) \
1211 "movq (%0),%%mm2\n"\ 1211 "movq (%0),%%mm2\n"\
1212 "movq (%1)," #out0 "\n"\ 1212 "movq (%1)," #out0 "\n"\
1213 "movq 8(%0),%%mm3\n"\ 1213 "movq 8(%0),%%mm3\n"\
1214 "movq 8(%1)," #out1 "\n"\ 1214 "movq 8(%1)," #out1 "\n"\
1215 "addl %3,%0\n"\ 1215 "add %3,%0\n"\
1216 "addl %3,%1\n"\ 1216 "add %3,%1\n"\
1217 "psubb " #out0 ", %%mm2\n"\ 1217 "psubb " #out0 ", %%mm2\n"\
1218 "psubb " #out1 ", %%mm3\n"\ 1218 "psubb " #out1 ", %%mm3\n"\
1219 "pxor %%mm7, %%mm2\n"\ 1219 "pxor %%mm7, %%mm2\n"\
1220 "pxor %%mm7, %%mm3\n"\ 1220 "pxor %%mm7, %%mm3\n"\
1221 "movq %%mm2, " #out0 "\n"\ 1221 "movq %%mm2, " #out0 "\n"\
1246 "packsswb %%mm7, %%mm7\n" 1246 "packsswb %%mm7, %%mm7\n"
1247 "movq (%0),%%mm0\n" 1247 "movq (%0),%%mm0\n"
1248 "movq (%1),%%mm2\n" 1248 "movq (%1),%%mm2\n"
1249 "movq 8(%0),%%mm1\n" 1249 "movq 8(%0),%%mm1\n"
1250 "movq 8(%1),%%mm3\n" 1250 "movq 8(%1),%%mm3\n"
1251 "addl %3,%0\n" 1251 "add %3,%0\n"
1252 "addl %3,%1\n" 1252 "add %3,%1\n"
1253 "subl $2, %%ecx\n" 1253 "subl $2, %%ecx\n"
1254 "psubb %%mm2, %%mm0\n" 1254 "psubb %%mm2, %%mm0\n"
1255 "psubb %%mm3, %%mm1\n" 1255 "psubb %%mm3, %%mm1\n"
1256 "pxor %%mm7, %%mm0\n" 1256 "pxor %%mm7, %%mm0\n"
1257 "pxor %%mm7, %%mm1\n" 1257 "pxor %%mm7, %%mm1\n"
1271 "movq %%mm0,%%mm6\n" 1271 "movq %%mm0,%%mm6\n"
1272 "psrlq $16, %%mm0\n" 1272 "psrlq $16, %%mm0\n"
1273 "paddw %%mm6,%%mm0\n" 1273 "paddw %%mm6,%%mm0\n"
1274 "movd %%mm0,%2\n" 1274 "movd %%mm0,%2\n"
1275 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 1275 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1276 : "r" (line_size) , "m" (h) 1276 : "r" ((long)line_size) , "m" (h)
1277 : "%ecx"); 1277 : "%ecx");
1278 return tmp & 0x7FFF; 1278 return tmp & 0x7FFF;
1279 } 1279 }
1280 #undef SUM 1280 #undef SUM
1281 1281
1289 #define SUM(in0, in1, out0, out1) \ 1289 #define SUM(in0, in1, out0, out1) \
1290 "movq (%0)," #out0 "\n"\ 1290 "movq (%0)," #out0 "\n"\
1291 "movq (%1),%%mm2\n"\ 1291 "movq (%1),%%mm2\n"\
1292 "movq 8(%0)," #out1 "\n"\ 1292 "movq 8(%0)," #out1 "\n"\
1293 "movq 8(%1),%%mm3\n"\ 1293 "movq 8(%1),%%mm3\n"\
1294 "addl %3,%0\n"\ 1294 "add %3,%0\n"\
1295 "addl %3,%1\n"\ 1295 "add %3,%1\n"\
1296 "psubb %%mm2, " #out0 "\n"\ 1296 "psubb %%mm2, " #out0 "\n"\
1297 "psubb %%mm3, " #out1 "\n"\ 1297 "psubb %%mm3, " #out1 "\n"\
1298 "pxor %%mm7, " #out0 "\n"\ 1298 "pxor %%mm7, " #out0 "\n"\
1299 "pxor %%mm7, " #out1 "\n"\ 1299 "pxor %%mm7, " #out1 "\n"\
1300 "psadbw " #out0 ", " #in0 "\n"\ 1300 "psadbw " #out0 ", " #in0 "\n"\
1310 "packsswb %%mm7, %%mm7\n" 1310 "packsswb %%mm7, %%mm7\n"
1311 "movq (%0),%%mm0\n" 1311 "movq (%0),%%mm0\n"
1312 "movq (%1),%%mm2\n" 1312 "movq (%1),%%mm2\n"
1313 "movq 8(%0),%%mm1\n" 1313 "movq 8(%0),%%mm1\n"
1314 "movq 8(%1),%%mm3\n" 1314 "movq 8(%1),%%mm3\n"
1315 "addl %3,%0\n" 1315 "add %3,%0\n"
1316 "addl %3,%1\n" 1316 "add %3,%1\n"
1317 "subl $2, %%ecx\n" 1317 "subl $2, %%ecx\n"
1318 "psubb %%mm2, %%mm0\n" 1318 "psubb %%mm2, %%mm0\n"
1319 "psubb %%mm3, %%mm1\n" 1319 "psubb %%mm3, %%mm1\n"
1320 "pxor %%mm7, %%mm0\n" 1320 "pxor %%mm7, %%mm0\n"
1321 "pxor %%mm7, %%mm1\n" 1321 "pxor %%mm7, %%mm1\n"
1329 "subl $2, %%ecx\n" 1329 "subl $2, %%ecx\n"
1330 "jnz 1b\n" 1330 "jnz 1b\n"
1331 1331
1332 "movd %%mm6,%2\n" 1332 "movd %%mm6,%2\n"
1333 : "+r" (pix1), "+r" (pix2), "=r"(tmp) 1333 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
1334 : "r" (line_size) , "m" (h) 1334 : "r" ((long)line_size) , "m" (h)
1335 : "%ecx"); 1335 : "%ecx");
1336 return tmp; 1336 return tmp;
1337 } 1337 }
1338 #undef SUM 1338 #undef SUM
1339 1339
1340 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 1340 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1341 int i=0; 1341 long i=0;
1342 asm volatile( 1342 asm volatile(
1343 "1: \n\t" 1343 "1: \n\t"
1344 "movq (%2, %0), %%mm0 \n\t" 1344 "movq (%2, %0), %%mm0 \n\t"
1345 "movq (%1, %0), %%mm1 \n\t" 1345 "movq (%1, %0), %%mm1 \n\t"
1346 "psubb %%mm0, %%mm1 \n\t" 1346 "psubb %%mm0, %%mm1 \n\t"
1347 "movq %%mm1, (%3, %0) \n\t" 1347 "movq %%mm1, (%3, %0) \n\t"
1348 "movq 8(%2, %0), %%mm0 \n\t" 1348 "movq 8(%2, %0), %%mm0 \n\t"
1349 "movq 8(%1, %0), %%mm1 \n\t" 1349 "movq 8(%1, %0), %%mm1 \n\t"
1350 "psubb %%mm0, %%mm1 \n\t" 1350 "psubb %%mm0, %%mm1 \n\t"
1351 "movq %%mm1, 8(%3, %0) \n\t" 1351 "movq %%mm1, 8(%3, %0) \n\t"
1352 "addl $16, %0 \n\t" 1352 "add $16, %0 \n\t"
1353 "cmpl %4, %0 \n\t" 1353 "cmp %4, %0 \n\t"
1354 " jb 1b \n\t" 1354 " jb 1b \n\t"
1355 : "+r" (i) 1355 : "+r" (i)
1356 : "r"(src1), "r"(src2), "r"(dst), "r"(w-15) 1356 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
1357 ); 1357 );
1358 for(; i<w; i++) 1358 for(; i<w; i++)
1359 dst[i+0] = src1[i+0]-src2[i+0]; 1359 dst[i+0] = src1[i+0]-src2[i+0];
1360 } 1360 }
1361 1361
1362 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ 1362 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
1363 int i=0; 1363 long i=0;
1364 uint8_t l, lt; 1364 uint8_t l, lt;
1365 1365
1366 asm volatile( 1366 asm volatile(
1367 "1: \n\t" 1367 "1: \n\t"
1368 "movq -1(%1, %0), %%mm0 \n\t" // LT 1368 "movq -1(%1, %0), %%mm0 \n\t" // LT
1377 "pminub %%mm5, %%mm1 \n\t" // min(T, L) 1377 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
1378 "pminub %%mm2, %%mm4 \n\t" 1378 "pminub %%mm2, %%mm4 \n\t"
1379 "pmaxub %%mm1, %%mm4 \n\t" 1379 "pmaxub %%mm1, %%mm4 \n\t"
1380 "psubb %%mm4, %%mm3 \n\t" // dst - pred 1380 "psubb %%mm4, %%mm3 \n\t" // dst - pred
1381 "movq %%mm3, (%3, %0) \n\t" 1381 "movq %%mm3, (%3, %0) \n\t"
1382 "addl $8, %0 \n\t" 1382 "add $8, %0 \n\t"
1383 "cmpl %4, %0 \n\t" 1383 "cmp %4, %0 \n\t"
1384 " jb 1b \n\t" 1384 " jb 1b \n\t"
1385 : "+r" (i) 1385 : "+r" (i)
1386 : "r"(src1), "r"(src2), "r"(dst), "r"(w) 1386 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
1387 ); 1387 );
1388 1388
1389 l= *left; 1389 l= *left;
1390 lt= *left_top; 1390 lt= *left_top;
1391 1391
1770 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ 1770 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1771 "psraw $5, %%mm4 \n\t"\ 1771 "psraw $5, %%mm4 \n\t"\
1772 "packuswb %%mm4, %%mm0 \n\t"\ 1772 "packuswb %%mm4, %%mm0 \n\t"\
1773 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ 1773 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1774 \ 1774 \
1775 "addl %3, %0 \n\t"\ 1775 "add %3, %0 \n\t"\
1776 "addl %4, %1 \n\t"\ 1776 "add %4, %1 \n\t"\
1777 "decl %2 \n\t"\ 1777 "decl %2 \n\t"\
1778 " jnz 1b \n\t"\ 1778 " jnz 1b \n\t"\
1779 : "+a"(src), "+c"(dst), "+m"(h)\ 1779 : "+a"(src), "+c"(dst), "+m"(h)\
1780 : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ 1780 : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1781 : "memory"\ 1781 : "memory"\
1782 );\ 1782 );\
1783 }\ 1783 }\
1784 \ 1784 \
1785 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1785 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1883 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ 1883 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1884 "psraw $5, %%mm3 \n\t"\ 1884 "psraw $5, %%mm3 \n\t"\
1885 "packuswb %%mm3, %%mm0 \n\t"\ 1885 "packuswb %%mm3, %%mm0 \n\t"\
1886 OP_MMX2(%%mm0, (%1), %%mm4, q)\ 1886 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1887 \ 1887 \
1888 "addl %3, %0 \n\t"\ 1888 "add %3, %0 \n\t"\
1889 "addl %4, %1 \n\t"\ 1889 "add %4, %1 \n\t"\
1890 "decl %2 \n\t"\ 1890 "decl %2 \n\t"\
1891 " jnz 1b \n\t"\ 1891 " jnz 1b \n\t"\
1892 : "+a"(src), "+c"(dst), "+m"(h)\ 1892 : "+a"(src), "+c"(dst), "+m"(h)\
1893 : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ 1893 : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1894 : "memory"\ 1894 : "memory"\
1895 );\ 1895 );\
1896 }\ 1896 }\
1897 \ 1897 \
1898 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1898 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1947 "punpckhbw %%mm7, %%mm3 \n\t"\ 1947 "punpckhbw %%mm7, %%mm3 \n\t"\
1948 "movq %%mm0, (%1) \n\t"\ 1948 "movq %%mm0, (%1) \n\t"\
1949 "movq %%mm1, 17*8(%1) \n\t"\ 1949 "movq %%mm1, 17*8(%1) \n\t"\
1950 "movq %%mm2, 2*17*8(%1) \n\t"\ 1950 "movq %%mm2, 2*17*8(%1) \n\t"\
1951 "movq %%mm3, 3*17*8(%1) \n\t"\ 1951 "movq %%mm3, 3*17*8(%1) \n\t"\
1952 "addl $8, %1 \n\t"\ 1952 "add $8, %1 \n\t"\
1953 "addl %3, %0 \n\t"\ 1953 "add %3, %0 \n\t"\
1954 "decl %2 \n\t"\ 1954 "decl %2 \n\t"\
1955 " jnz 1b \n\t"\ 1955 " jnz 1b \n\t"\
1956 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 1956 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1957 : "r" (srcStride)\ 1957 : "r" ((long)srcStride)\
1958 : "memory"\ 1958 : "memory"\
1959 );\ 1959 );\
1960 \ 1960 \
1961 temp_ptr= temp;\ 1961 temp_ptr= temp;\
1962 count=4;\ 1962 count=4;\
1969 "movq 8(%0), %%mm1 \n\t"\ 1969 "movq 8(%0), %%mm1 \n\t"\
1970 "movq 16(%0), %%mm2 \n\t"\ 1970 "movq 16(%0), %%mm2 \n\t"\
1971 "movq 24(%0), %%mm3 \n\t"\ 1971 "movq 24(%0), %%mm3 \n\t"\
1972 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 1972 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1973 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 1973 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1974 "addl %4, %1 \n\t"\ 1974 "add %4, %1 \n\t"\
1975 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 1975 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1976 \ 1976 \
1977 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 1977 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1978 "addl %4, %1 \n\t"\ 1978 "add %4, %1 \n\t"\
1979 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 1979 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1980 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ 1980 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1981 "addl %4, %1 \n\t"\ 1981 "add %4, %1 \n\t"\
1982 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ 1982 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1983 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ 1983 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1984 "addl %4, %1 \n\t"\ 1984 "add %4, %1 \n\t"\
1985 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ 1985 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1986 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ 1986 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1987 "addl %4, %1 \n\t"\ 1987 "add %4, %1 \n\t"\
1988 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ 1988 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1989 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ 1989 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1990 "addl %4, %1 \n\t"\ 1990 "add %4, %1 \n\t"\
1991 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ 1991 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1992 \ 1992 \
1993 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ 1993 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1994 "addl %4, %1 \n\t" \ 1994 "add %4, %1 \n\t" \
1995 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ 1995 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1996 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ 1996 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1997 \ 1997 \
1998 "addl $136, %0 \n\t"\ 1998 "add $136, %0 \n\t"\
1999 "addl %6, %1 \n\t"\ 1999 "add %6, %1 \n\t"\
2000 "decl %2 \n\t"\ 2000 "decl %2 \n\t"\
2001 " jnz 1b \n\t"\ 2001 " jnz 1b \n\t"\
2002 \ 2002 \
2003 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 2003 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2004 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\ 2004 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
2005 :"memory"\ 2005 :"memory"\
2006 );\ 2006 );\
2007 }\ 2007 }\
2008 \ 2008 \
2009 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2009 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2019 "movq (%0), %%mm1 \n\t"\ 2019 "movq (%0), %%mm1 \n\t"\
2020 "punpcklbw %%mm7, %%mm0 \n\t"\ 2020 "punpcklbw %%mm7, %%mm0 \n\t"\
2021 "punpckhbw %%mm7, %%mm1 \n\t"\ 2021 "punpckhbw %%mm7, %%mm1 \n\t"\
2022 "movq %%mm0, (%1) \n\t"\ 2022 "movq %%mm0, (%1) \n\t"\
2023 "movq %%mm1, 9*8(%1) \n\t"\ 2023 "movq %%mm1, 9*8(%1) \n\t"\
2024 "addl $8, %1 \n\t"\ 2024 "add $8, %1 \n\t"\
2025 "addl %3, %0 \n\t"\ 2025 "add %3, %0 \n\t"\
2026 "decl %2 \n\t"\ 2026 "decl %2 \n\t"\
2027 " jnz 1b \n\t"\ 2027 " jnz 1b \n\t"\
2028 : "+r" (src), "+r" (temp_ptr), "+r"(count)\ 2028 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
2029 : "r" (srcStride)\ 2029 : "r" ((long)srcStride)\
2030 : "memory"\ 2030 : "memory"\
2031 );\ 2031 );\
2032 \ 2032 \
2033 temp_ptr= temp;\ 2033 temp_ptr= temp;\
2034 count=2;\ 2034 count=2;\
2041 "movq 8(%0), %%mm1 \n\t"\ 2041 "movq 8(%0), %%mm1 \n\t"\
2042 "movq 16(%0), %%mm2 \n\t"\ 2042 "movq 16(%0), %%mm2 \n\t"\
2043 "movq 24(%0), %%mm3 \n\t"\ 2043 "movq 24(%0), %%mm3 \n\t"\
2044 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ 2044 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
2045 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ 2045 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
2046 "addl %4, %1 \n\t"\ 2046 "add %4, %1 \n\t"\
2047 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ 2047 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
2048 \ 2048 \
2049 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ 2049 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
2050 "addl %4, %1 \n\t"\ 2050 "add %4, %1 \n\t"\
2051 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ 2051 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
2052 \ 2052 \
2053 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ 2053 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
2054 "addl %4, %1 \n\t"\ 2054 "add %4, %1 \n\t"\
2055 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ 2055 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
2056 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ 2056 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
2057 \ 2057 \
2058 "addl $72, %0 \n\t"\ 2058 "add $72, %0 \n\t"\
2059 "addl %6, %1 \n\t"\ 2059 "add %6, %1 \n\t"\
2060 "decl %2 \n\t"\ 2060 "decl %2 \n\t"\
2061 " jnz 1b \n\t"\ 2061 " jnz 1b \n\t"\
2062 \ 2062 \
2063 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ 2063 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
2064 : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\ 2064 : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
2065 : "memory"\ 2065 : "memory"\
2066 );\ 2066 );\
2067 }\ 2067 }\
2068 \ 2068 \
2069 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 2069 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
2295 "paddw "#D", %%mm6 \n\t"\ 2295 "paddw "#D", %%mm6 \n\t"\
2296 "psllw $2, %%mm6 \n\t"\ 2296 "psllw $2, %%mm6 \n\t"\
2297 "psubw "#B", %%mm6 \n\t"\ 2297 "psubw "#B", %%mm6 \n\t"\
2298 "psubw "#E", %%mm6 \n\t"\ 2298 "psubw "#E", %%mm6 \n\t"\
2299 "pmullw %4, %%mm6 \n\t"\ 2299 "pmullw %4, %%mm6 \n\t"\
2300 "addl %2, %0 \n\t"\ 2300 "add %2, %0 \n\t"\
2301 "punpcklbw %%mm7, "#F" \n\t"\ 2301 "punpcklbw %%mm7, "#F" \n\t"\
2302 "paddw %5, "#A" \n\t"\ 2302 "paddw %5, "#A" \n\t"\
2303 "paddw "#F", "#A" \n\t"\ 2303 "paddw "#F", "#A" \n\t"\
2304 "paddw "#A", %%mm6 \n\t"\ 2304 "paddw "#A", %%mm6 \n\t"\
2305 "psraw $5, %%mm6 \n\t"\ 2305 "psraw $5, %%mm6 \n\t"\
2306 "packuswb %%mm6, %%mm6 \n\t"\ 2306 "packuswb %%mm6, %%mm6 \n\t"\
2307 OP(%%mm6, (%1), A, d)\ 2307 OP(%%mm6, (%1), A, d)\
2308 "addl %3, %1 \n\t" 2308 "add %3, %1 \n\t"
2309 2309
2310 #define QPEL_H264HV(A,B,C,D,E,F,OF)\ 2310 #define QPEL_H264HV(A,B,C,D,E,F,OF)\
2311 "movd (%0), "#F" \n\t"\ 2311 "movd (%0), "#F" \n\t"\
2312 "movq "#C", %%mm6 \n\t"\ 2312 "movq "#C", %%mm6 \n\t"\
2313 "paddw "#D", %%mm6 \n\t"\ 2313 "paddw "#D", %%mm6 \n\t"\
2314 "psllw $2, %%mm6 \n\t"\ 2314 "psllw $2, %%mm6 \n\t"\
2315 "psubw "#B", %%mm6 \n\t"\ 2315 "psubw "#B", %%mm6 \n\t"\
2316 "psubw "#E", %%mm6 \n\t"\ 2316 "psubw "#E", %%mm6 \n\t"\
2317 "pmullw %3, %%mm6 \n\t"\ 2317 "pmullw %3, %%mm6 \n\t"\
2318 "addl %2, %0 \n\t"\ 2318 "add %2, %0 \n\t"\
2319 "punpcklbw %%mm7, "#F" \n\t"\ 2319 "punpcklbw %%mm7, "#F" \n\t"\
2320 "paddw "#F", "#A" \n\t"\ 2320 "paddw "#F", "#A" \n\t"\
2321 "paddw "#A", %%mm6 \n\t"\ 2321 "paddw "#A", %%mm6 \n\t"\
2322 "movq %%mm6, "#OF"(%1) \n\t" 2322 "movq %%mm6, "#OF"(%1) \n\t"
2323 2323
2351 "paddw %%mm5, %%mm0 \n\t"\ 2351 "paddw %%mm5, %%mm0 \n\t"\
2352 "paddw %%mm2, %%mm0 \n\t"\ 2352 "paddw %%mm2, %%mm0 \n\t"\
2353 "psraw $5, %%mm0 \n\t"\ 2353 "psraw $5, %%mm0 \n\t"\
2354 "packuswb %%mm0, %%mm0 \n\t"\ 2354 "packuswb %%mm0, %%mm0 \n\t"\
2355 OP(%%mm0, (%1),%%mm6, d)\ 2355 OP(%%mm0, (%1),%%mm6, d)\
2356 "addl %3, %0 \n\t"\ 2356 "add %3, %0 \n\t"\
2357 "addl %4, %1 \n\t"\ 2357 "add %4, %1 \n\t"\
2358 "decl %2 \n\t"\ 2358 "decl %2 \n\t"\
2359 " jnz 1b \n\t"\ 2359 " jnz 1b \n\t"\
2360 : "+a"(src), "+c"(dst), "+m"(h)\ 2360 : "+a"(src), "+c"(dst), "+m"(h)\
2361 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 2361 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2362 : "memory"\ 2362 : "memory"\
2363 );\ 2363 );\
2364 }\ 2364 }\
2365 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2365 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2366 src -= 2*srcStride;\ 2366 src -= 2*srcStride;\
2367 asm volatile(\ 2367 asm volatile(\
2368 "pxor %%mm7, %%mm7 \n\t"\ 2368 "pxor %%mm7, %%mm7 \n\t"\
2369 "movd (%0), %%mm0 \n\t"\ 2369 "movd (%0), %%mm0 \n\t"\
2370 "addl %2, %0 \n\t"\ 2370 "add %2, %0 \n\t"\
2371 "movd (%0), %%mm1 \n\t"\ 2371 "movd (%0), %%mm1 \n\t"\
2372 "addl %2, %0 \n\t"\ 2372 "add %2, %0 \n\t"\
2373 "movd (%0), %%mm2 \n\t"\ 2373 "movd (%0), %%mm2 \n\t"\
2374 "addl %2, %0 \n\t"\ 2374 "add %2, %0 \n\t"\
2375 "movd (%0), %%mm3 \n\t"\ 2375 "movd (%0), %%mm3 \n\t"\
2376 "addl %2, %0 \n\t"\ 2376 "add %2, %0 \n\t"\
2377 "movd (%0), %%mm4 \n\t"\ 2377 "movd (%0), %%mm4 \n\t"\
2378 "addl %2, %0 \n\t"\ 2378 "add %2, %0 \n\t"\
2379 "punpcklbw %%mm7, %%mm0 \n\t"\ 2379 "punpcklbw %%mm7, %%mm0 \n\t"\
2380 "punpcklbw %%mm7, %%mm1 \n\t"\ 2380 "punpcklbw %%mm7, %%mm1 \n\t"\
2381 "punpcklbw %%mm7, %%mm2 \n\t"\ 2381 "punpcklbw %%mm7, %%mm2 \n\t"\
2382 "punpcklbw %%mm7, %%mm3 \n\t"\ 2382 "punpcklbw %%mm7, %%mm3 \n\t"\
2383 "punpcklbw %%mm7, %%mm4 \n\t"\ 2383 "punpcklbw %%mm7, %%mm4 \n\t"\
2385 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 2385 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
2386 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 2386 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
2387 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 2387 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
2388 \ 2388 \
2389 : "+a"(src), "+c"(dst)\ 2389 : "+a"(src), "+c"(dst)\
2390 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 2390 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2391 : "memory"\ 2391 : "memory"\
2392 );\ 2392 );\
2393 }\ 2393 }\
2394 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 2394 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2395 int h=4;\ 2395 int h=4;\
2397 src -= 2*srcStride+2;\ 2397 src -= 2*srcStride+2;\
2398 while(w--){\ 2398 while(w--){\
2399 asm volatile(\ 2399 asm volatile(\
2400 "pxor %%mm7, %%mm7 \n\t"\ 2400 "pxor %%mm7, %%mm7 \n\t"\
2401 "movd (%0), %%mm0 \n\t"\ 2401 "movd (%0), %%mm0 \n\t"\
2402 "addl %2, %0 \n\t"\ 2402 "add %2, %0 \n\t"\
2403 "movd (%0), %%mm1 \n\t"\ 2403 "movd (%0), %%mm1 \n\t"\
2404 "addl %2, %0 \n\t"\ 2404 "add %2, %0 \n\t"\
2405 "movd (%0), %%mm2 \n\t"\ 2405 "movd (%0), %%mm2 \n\t"\
2406 "addl %2, %0 \n\t"\ 2406 "add %2, %0 \n\t"\
2407 "movd (%0), %%mm3 \n\t"\ 2407 "movd (%0), %%mm3 \n\t"\
2408 "addl %2, %0 \n\t"\ 2408 "add %2, %0 \n\t"\
2409 "movd (%0), %%mm4 \n\t"\ 2409 "movd (%0), %%mm4 \n\t"\
2410 "addl %2, %0 \n\t"\ 2410 "add %2, %0 \n\t"\
2411 "punpcklbw %%mm7, %%mm0 \n\t"\ 2411 "punpcklbw %%mm7, %%mm0 \n\t"\
2412 "punpcklbw %%mm7, %%mm1 \n\t"\ 2412 "punpcklbw %%mm7, %%mm1 \n\t"\
2413 "punpcklbw %%mm7, %%mm2 \n\t"\ 2413 "punpcklbw %%mm7, %%mm2 \n\t"\
2414 "punpcklbw %%mm7, %%mm3 \n\t"\ 2414 "punpcklbw %%mm7, %%mm3 \n\t"\
2415 "punpcklbw %%mm7, %%mm4 \n\t"\ 2415 "punpcklbw %%mm7, %%mm4 \n\t"\
2417 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ 2417 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
2418 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ 2418 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
2419 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ 2419 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
2420 \ 2420 \
2421 : "+a"(src)\ 2421 : "+a"(src)\
2422 : "c"(tmp), "S"(srcStride), "m"(ff_pw_5)\ 2422 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
2423 : "memory"\ 2423 : "memory"\
2424 );\ 2424 );\
2425 tmp += 4;\ 2425 tmp += 4;\
2426 src += 4 - 9*srcStride;\ 2426 src += 4 - 9*srcStride;\
2427 }\ 2427 }\
2443 "paddw %%mm6, %%mm2 \n\t"\ 2443 "paddw %%mm6, %%mm2 \n\t"\
2444 "paddw %%mm2, %%mm0 \n\t"\ 2444 "paddw %%mm2, %%mm0 \n\t"\
2445 "psraw $6, %%mm0 \n\t"\ 2445 "psraw $6, %%mm0 \n\t"\
2446 "packuswb %%mm0, %%mm0 \n\t"\ 2446 "packuswb %%mm0, %%mm0 \n\t"\
2447 OP(%%mm0, (%1),%%mm7, d)\ 2447 OP(%%mm0, (%1),%%mm7, d)\
2448 "addl $24, %0 \n\t"\ 2448 "add $24, %0 \n\t"\
2449 "addl %3, %1 \n\t"\ 2449 "add %3, %1 \n\t"\
2450 "decl %2 \n\t"\ 2450 "decl %2 \n\t"\
2451 " jnz 1b \n\t"\ 2451 " jnz 1b \n\t"\
2452 : "+a"(tmp), "+c"(dst), "+m"(h)\ 2452 : "+a"(tmp), "+c"(dst), "+m"(h)\
2453 : "S"(dstStride), "m"(ff_pw_32)\ 2453 : "S"((long)dstStride), "m"(ff_pw_32)\
2454 : "memory"\ 2454 : "memory"\
2455 );\ 2455 );\
2456 }\ 2456 }\
2457 \ 2457 \
2458 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2458 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2500 "paddw %%mm4, %%mm1 \n\t"\ 2500 "paddw %%mm4, %%mm1 \n\t"\
2501 "psraw $5, %%mm0 \n\t"\ 2501 "psraw $5, %%mm0 \n\t"\
2502 "psraw $5, %%mm1 \n\t"\ 2502 "psraw $5, %%mm1 \n\t"\
2503 "packuswb %%mm1, %%mm0 \n\t"\ 2503 "packuswb %%mm1, %%mm0 \n\t"\
2504 OP(%%mm0, (%1),%%mm5, q)\ 2504 OP(%%mm0, (%1),%%mm5, q)\
2505 "addl %3, %0 \n\t"\ 2505 "add %3, %0 \n\t"\
2506 "addl %4, %1 \n\t"\ 2506 "add %4, %1 \n\t"\
2507 "decl %2 \n\t"\ 2507 "decl %2 \n\t"\
2508 " jnz 1b \n\t"\ 2508 " jnz 1b \n\t"\
2509 : "+a"(src), "+c"(dst), "+m"(h)\ 2509 : "+a"(src), "+c"(dst), "+m"(h)\
2510 : "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 2510 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2511 : "memory"\ 2511 : "memory"\
2512 );\ 2512 );\
2513 }\ 2513 }\
2514 \ 2514 \
2515 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2515 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2518 \ 2518 \
2519 while(h--){\ 2519 while(h--){\
2520 asm volatile(\ 2520 asm volatile(\
2521 "pxor %%mm7, %%mm7 \n\t"\ 2521 "pxor %%mm7, %%mm7 \n\t"\
2522 "movd (%0), %%mm0 \n\t"\ 2522 "movd (%0), %%mm0 \n\t"\
2523 "addl %2, %0 \n\t"\ 2523 "add %2, %0 \n\t"\
2524 "movd (%0), %%mm1 \n\t"\ 2524 "movd (%0), %%mm1 \n\t"\
2525 "addl %2, %0 \n\t"\ 2525 "add %2, %0 \n\t"\
2526 "movd (%0), %%mm2 \n\t"\ 2526 "movd (%0), %%mm2 \n\t"\
2527 "addl %2, %0 \n\t"\ 2527 "add %2, %0 \n\t"\
2528 "movd (%0), %%mm3 \n\t"\ 2528 "movd (%0), %%mm3 \n\t"\
2529 "addl %2, %0 \n\t"\ 2529 "add %2, %0 \n\t"\
2530 "movd (%0), %%mm4 \n\t"\ 2530 "movd (%0), %%mm4 \n\t"\
2531 "addl %2, %0 \n\t"\ 2531 "add %2, %0 \n\t"\
2532 "punpcklbw %%mm7, %%mm0 \n\t"\ 2532 "punpcklbw %%mm7, %%mm0 \n\t"\
2533 "punpcklbw %%mm7, %%mm1 \n\t"\ 2533 "punpcklbw %%mm7, %%mm1 \n\t"\
2534 "punpcklbw %%mm7, %%mm2 \n\t"\ 2534 "punpcklbw %%mm7, %%mm2 \n\t"\
2535 "punpcklbw %%mm7, %%mm3 \n\t"\ 2535 "punpcklbw %%mm7, %%mm3 \n\t"\
2536 "punpcklbw %%mm7, %%mm4 \n\t"\ 2536 "punpcklbw %%mm7, %%mm4 \n\t"\
2542 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 2542 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
2543 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 2543 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
2544 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ 2544 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
2545 \ 2545 \
2546 : "+a"(src), "+c"(dst)\ 2546 : "+a"(src), "+c"(dst)\
2547 : "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 2547 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
2548 : "memory"\ 2548 : "memory"\
2549 );\ 2549 );\
2550 src += 4-13*srcStride;\ 2550 src += 4-13*srcStride;\
2551 dst += 4-8*dstStride;\ 2551 dst += 4-8*dstStride;\
2552 }\ 2552 }\
2557 src -= 2*srcStride+2;\ 2557 src -= 2*srcStride+2;\
2558 while(w--){\ 2558 while(w--){\
2559 asm volatile(\ 2559 asm volatile(\
2560 "pxor %%mm7, %%mm7 \n\t"\ 2560 "pxor %%mm7, %%mm7 \n\t"\
2561 "movd (%0), %%mm0 \n\t"\ 2561 "movd (%0), %%mm0 \n\t"\
2562 "addl %2, %0 \n\t"\ 2562 "add %2, %0 \n\t"\
2563 "movd (%0), %%mm1 \n\t"\ 2563 "movd (%0), %%mm1 \n\t"\
2564 "addl %2, %0 \n\t"\ 2564 "add %2, %0 \n\t"\
2565 "movd (%0), %%mm2 \n\t"\ 2565 "movd (%0), %%mm2 \n\t"\
2566 "addl %2, %0 \n\t"\ 2566 "add %2, %0 \n\t"\
2567 "movd (%0), %%mm3 \n\t"\ 2567 "movd (%0), %%mm3 \n\t"\
2568 "addl %2, %0 \n\t"\ 2568 "add %2, %0 \n\t"\
2569 "movd (%0), %%mm4 \n\t"\ 2569 "movd (%0), %%mm4 \n\t"\
2570 "addl %2, %0 \n\t"\ 2570 "add %2, %0 \n\t"\
2571 "punpcklbw %%mm7, %%mm0 \n\t"\ 2571 "punpcklbw %%mm7, %%mm0 \n\t"\
2572 "punpcklbw %%mm7, %%mm1 \n\t"\ 2572 "punpcklbw %%mm7, %%mm1 \n\t"\
2573 "punpcklbw %%mm7, %%mm2 \n\t"\ 2573 "punpcklbw %%mm7, %%mm2 \n\t"\
2574 "punpcklbw %%mm7, %%mm3 \n\t"\ 2574 "punpcklbw %%mm7, %%mm3 \n\t"\
2575 "punpcklbw %%mm7, %%mm4 \n\t"\ 2575 "punpcklbw %%mm7, %%mm4 \n\t"\
2581 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\ 2581 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
2582 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\ 2582 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
2583 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ 2583 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
2584 \ 2584 \
2585 : "+a"(src)\ 2585 : "+a"(src)\
2586 : "c"(tmp), "S"(srcStride), "m"(ff_pw_5)\ 2586 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
2587 : "memory"\ 2587 : "memory"\
2588 );\ 2588 );\
2589 tmp += 4;\ 2589 tmp += 4;\
2590 src += 4 - 13*srcStride;\ 2590 src += 4 - 13*srcStride;\
2591 }\ 2591 }\
2621 "paddw %%mm5, %%mm3 \n\t"\ 2621 "paddw %%mm5, %%mm3 \n\t"\
2622 "psraw $6, %%mm0 \n\t"\ 2622 "psraw $6, %%mm0 \n\t"\
2623 "psraw $6, %%mm3 \n\t"\ 2623 "psraw $6, %%mm3 \n\t"\
2624 "packuswb %%mm3, %%mm0 \n\t"\ 2624 "packuswb %%mm3, %%mm0 \n\t"\
2625 OP(%%mm0, (%1),%%mm7, q)\ 2625 OP(%%mm0, (%1),%%mm7, q)\
2626 "addl $32, %0 \n\t"\ 2626 "add $32, %0 \n\t"\
2627 "addl %3, %1 \n\t"\ 2627 "add %3, %1 \n\t"\
2628 "decl %2 \n\t"\ 2628 "decl %2 \n\t"\
2629 " jnz 1b \n\t"\ 2629 " jnz 1b \n\t"\
2630 : "+a"(tmp), "+c"(dst), "+m"(h)\ 2630 : "+a"(tmp), "+c"(dst), "+m"(h)\
2631 : "S"(dstStride), "m"(ff_pw_32)\ 2631 : "S"((long)dstStride), "m"(ff_pw_32)\
2632 : "memory"\ 2632 : "memory"\
2633 );\ 2633 );\
2634 }\ 2634 }\
2635 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 2635 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2636 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 2636 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
2829 c->put_ ## postfix1 = put_ ## postfix2;\ 2829 c->put_ ## postfix1 = put_ ## postfix2;\
2830 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ 2830 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
2831 c->avg_ ## postfix1 = avg_ ## postfix2; 2831 c->avg_ ## postfix1 = avg_ ## postfix2;
2832 2832
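The qpel macros above follow the same rules with specific-register constraints: strides are pinned to named registers and cast to long ("d"((long)srcStride), "S"((long)dstStride), and so on) because the templates hard-code those registers in their addressing, while the row counter h stays a 32-bit int held in memory ("+m"(h)) and therefore keeps its explicit suffix in decl %2 (likewise the counts kept in %%ecx elsewhere keep their subl/decl forms). A hypothetical stripped-down row copy in that constraint style (illustration only, not code from the file):

    static void copy_rows_sketch(uint8_t *dst, uint8_t *src,
                                 int dstStride, int srcStride, int h)
    {
        asm volatile(
            "1:                 \n\t"
            "movq (%0), %%mm0   \n\t" /* copy 8 bytes per row                     */
            "movq %%mm0, (%1)   \n\t"
            "add %3, %0         \n\t" /* src += srcStride (full width on x86_64)  */
            "add %4, %1         \n\t" /* dst += dstStride                         */
            "decl %2            \n\t" /* h is an int in memory: keep the l suffix */
            " jnz 1b            \n\t"
            : "+a"(src), "+c"(dst), "+m"(h)
            : "d"((long)srcStride), "S"((long)dstStride)
            : "memory");
    }
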
2833 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ 2833 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2834 int i=0; 2834 long i=0;
2835 2835
2836 assert(ABS(scale) < 256); 2836 assert(ABS(scale) < 256);
2837 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; 2837 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2838 2838
2839 asm volatile( 2839 asm volatile(
2861 "pmaddwd %%mm0, %%mm0 \n\t" 2861 "pmaddwd %%mm0, %%mm0 \n\t"
2862 "pmaddwd %%mm1, %%mm1 \n\t" 2862 "pmaddwd %%mm1, %%mm1 \n\t"
2863 "paddd %%mm1, %%mm0 \n\t" 2863 "paddd %%mm1, %%mm0 \n\t"
2864 "psrld $4, %%mm0 \n\t" 2864 "psrld $4, %%mm0 \n\t"
2865 "paddd %%mm0, %%mm7 \n\t" 2865 "paddd %%mm0, %%mm7 \n\t"
2866 "addl $16, %0 \n\t" 2866 "add $16, %0 \n\t"
2867 "cmpl $128, %0 \n\t" //FIXME optimize & bench 2867 "cmp $128, %0 \n\t" //FIXME optimize & bench
2868 " jb 1b \n\t" 2868 " jb 1b \n\t"
2869 "movq %%mm7, %%mm6 \n\t" 2869 "movq %%mm7, %%mm6 \n\t"
2870 "psrlq $32, %%mm7 \n\t" 2870 "psrlq $32, %%mm7 \n\t"
2871 "paddd %%mm6, %%mm7 \n\t" 2871 "paddd %%mm6, %%mm7 \n\t"
2872 "psrld $2, %%mm7 \n\t" 2872 "psrld $2, %%mm7 \n\t"
2877 ); 2877 );
2878 return i; 2878 return i;
2879 } 2879 }
2880 2880
2881 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ 2881 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
2882 int i=0; 2882 long i=0;
2883 2883
2884 if(ABS(scale) < 256){ 2884 if(ABS(scale) < 256){
2885 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; 2885 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
2886 asm volatile( 2886 asm volatile(
2887 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w 2887 "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
2900 "psraw $1, %%mm1 \n\t" 2900 "psraw $1, %%mm1 \n\t"
2901 "paddw (%2, %0), %%mm0 \n\t" 2901 "paddw (%2, %0), %%mm0 \n\t"
2902 "paddw 8(%2, %0), %%mm1 \n\t" 2902 "paddw 8(%2, %0), %%mm1 \n\t"
2903 "movq %%mm0, (%2, %0) \n\t" 2903 "movq %%mm0, (%2, %0) \n\t"
2904 "movq %%mm1, 8(%2, %0) \n\t" 2904 "movq %%mm1, 8(%2, %0) \n\t"
2905 "addl $16, %0 \n\t" 2905 "add $16, %0 \n\t"
2906 "cmpl $128, %0 \n\t" //FIXME optimize & bench 2906 "cmp $128, %0 \n\t" //FIXME optimize & bench
2907 " jb 1b \n\t" 2907 " jb 1b \n\t"
2908 2908
2909 : "+r" (i) 2909 : "+r" (i)
2910 : "r"(basis), "r"(rem), "g"(scale) 2910 : "r"(basis), "r"(rem), "g"(scale)
2911 ); 2911 );