comparison x86/dsputilenc_mmx.c @ 12497:c5ffa8b81f9c libavcodec

Move sse16_sse2() from inline asm to yasm. It is one of the functions causing Win64/FATE issues.
author rbultje
date Fri, 17 Sep 2010 01:44:17 +0000
parents 9fef0a8ddd63
children c997f09d1e10
comparing 12496:d9b601af5e5e with 12497:c5ffa8b81f9c
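The routine being moved computes the sum of squared differences between two 16-pixel-wide blocks over h rows of line_size-strided memory. As a reading aid, here is a minimal scalar sketch of that computation; sse16_c_ref is a hypothetical name used only for illustration and is not part of this changeset (FFmpeg keeps its own C reference version elsewhere).

#include <stdint.h>

/* Hypothetical scalar reference, for illustration only: sum of squared
 * differences over a 16-pixel-wide block of h rows. The unused first
 * argument mirrors the context pointer in the prototype in the diff below. */
static int sse16_c_ref(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    (void)v;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}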
321 : "r" ((x86_reg)line_size) , "m" (h) 321 : "r" ((x86_reg)line_size) , "m" (h)
322 : "%ecx"); 322 : "%ecx");
323 return tmp; 323 return tmp;
324 } 324 }
325 325
-static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
-    int tmp;
-    __asm__ volatile (
-        "shr $1,%2\n"
-        "pxor %%xmm0,%%xmm0\n"      /* mm0 = 0 */
-        "pxor %%xmm7,%%xmm7\n"      /* mm7 holds the sum */
-        "1:\n"
-        "movdqu (%0),%%xmm1\n"      /* mm1 = pix1[0][0-15] */
-        "movdqu (%1),%%xmm2\n"      /* mm2 = pix2[0][0-15] */
-        "movdqu (%0,%4),%%xmm3\n"   /* mm3 = pix1[1][0-15] */
-        "movdqu (%1,%4),%%xmm4\n"   /* mm4 = pix2[1][0-15] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movdqa %%xmm1,%%xmm5\n"
-        "movdqa %%xmm3,%%xmm6\n"
-        "psubusb %%xmm2,%%xmm1\n"
-        "psubusb %%xmm4,%%xmm3\n"
-        "psubusb %%xmm5,%%xmm2\n"
-        "psubusb %%xmm6,%%xmm4\n"
-
-        "por %%xmm1,%%xmm2\n"
-        "por %%xmm3,%%xmm4\n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movdqa %%xmm2,%%xmm1\n"
-        "movdqa %%xmm4,%%xmm3\n"
-
-        "punpckhbw %%xmm0,%%xmm2\n"
-        "punpckhbw %%xmm0,%%xmm4\n"
-        "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
-        "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
-
-        "pmaddwd %%xmm2,%%xmm2\n"
-        "pmaddwd %%xmm4,%%xmm4\n"
-        "pmaddwd %%xmm1,%%xmm1\n"
-        "pmaddwd %%xmm3,%%xmm3\n"
-
-        "lea (%0,%4,2), %0\n"       /* pix1 += 2*line_size */
-        "lea (%1,%4,2), %1\n"       /* pix2 += 2*line_size */
-
-        "paddd %%xmm2,%%xmm1\n"
-        "paddd %%xmm4,%%xmm3\n"
-        "paddd %%xmm1,%%xmm7\n"
-        "paddd %%xmm3,%%xmm7\n"
-
-        "decl %2\n"
-        "jnz 1b\n"
-
-        "movdqa %%xmm7,%%xmm1\n"
-        "psrldq $8, %%xmm7\n"       /* shift hi qword to lo */
-        "paddd %%xmm1,%%xmm7\n"
-        "movdqa %%xmm7,%%xmm1\n"
-        "psrldq $4, %%xmm7\n"       /* shift hi dword to lo */
-        "paddd %%xmm1,%%xmm7\n"
-        "movd %%xmm7,%3\n"
-        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
-        : "r" ((x86_reg)line_size));
-    return tmp;
-}
+int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
 
 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
     int tmp;
     __asm__ volatile (
         "movl %3,%%ecx\n"
@@ -1374,11 +1314,11 @@
 
     c->hadamard8_diff[0]= hadamard8_diff16_mmx;
     c->hadamard8_diff[1]= hadamard8_diff_mmx;
 
     c->pix_norm1 = pix_norm1_mmx;
-    c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? sse16_sse2 : sse16_mmx;
+    c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
     c->sse[1] = sse8_mmx;
     c->vsad[4]= vsad_intra16_mmx;
 
     c->nsse[0] = nsse16_mmx;
     c->nsse[1] = nsse8_mmx;
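The second hunk only renames the pointer stored in the DSP context: the SSE2 routine is still selected once at init time when mm_flags has AV_CPU_FLAG_SSE2 set, with the MMX version as the fallback, so callers never branch on CPU features again. A minimal, self-contained sketch of that dispatch pattern follows; the names and flag value are hypothetical stand-ins, the real table lives in DSPContext.

#include <stdint.h>

typedef int (*sse_fn)(void *ctx, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h);

enum { SKETCH_CPU_SSE2 = 1 << 2 };   /* stand-in for AV_CPU_FLAG_SSE2 */

struct sse_table_sketch { sse_fn sse[2]; };   /* [0]=16-wide, [1]=8-wide */

/* Same shape as the hunk above: pick the SSE2 routine when the CPU
 * supports it, otherwise fall back to the baseline implementation. */
static void sse_init_sketch(struct sse_table_sketch *c, int cpu_flags,
                            sse_fn sse2_impl, sse_fn baseline_impl)
{
    c->sse[0] = (cpu_flags & SKETCH_CPU_SSE2) ? sse2_impl : baseline_impl;
}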