libavcodec.hg: comparison of x86/dsputilenc_mmx.c @ 12497:c5ffa8b81f9c
Move sse16_sse2() from inline asm to yasm. It is one of the functions causing
Win64/FATE issues.
author:   rbultje
date:     Fri, 17 Sep 2010 01:44:17 +0000
parents:  9fef0a8ddd63
children: c997f09d1e10
comparing 12496:d9b601af5e5e with 12497:c5ffa8b81f9c
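For context, the sse[0] slot in DSPContext holds the block sum-of-squared-errors function used by the encoder. A minimal scalar sketch of what the 16-pixel-wide version computes is below; sse16_ref is a hypothetical name for illustration, not part of this patch, and the void *v context argument is unused, as in the asm. The yasm implementation of ff_sse16_sse2 itself lives in a separate .asm file not shown in this comparison.

    /* Hypothetical scalar reference, not part of the patch: sum of
     * squared differences between two 16-pixel-wide blocks of height h,
     * with rows line_size bytes apart. */
    static int sse16_ref(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++) {
                int d = pix1[x] - pix2[x];
                sum += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }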
321 : "r" ((x86_reg)line_size) , "m" (h) | 321 : "r" ((x86_reg)line_size) , "m" (h) |
322 : "%ecx"); | 322 : "%ecx"); |
323 return tmp; | 323 return tmp; |
324 } | 324 } |
325 | 325 |
326 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { | 326 int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); |
327 int tmp; | |
328 __asm__ volatile ( | |
329 "shr $1,%2\n" | |
330 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ | |
331 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ | |
332 "1:\n" | |
333 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ | |
334 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ | |
335 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ | |
336 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ | |
337 | |
338 /* todo: mm1-mm2, mm3-mm4 */ | |
339 /* algo: subtract mm1 from mm2 with saturation and vice versa */ | |
340 /* OR the results to get absolute difference */ | |
341 "movdqa %%xmm1,%%xmm5\n" | |
342 "movdqa %%xmm3,%%xmm6\n" | |
343 "psubusb %%xmm2,%%xmm1\n" | |
344 "psubusb %%xmm4,%%xmm3\n" | |
345 "psubusb %%xmm5,%%xmm2\n" | |
346 "psubusb %%xmm6,%%xmm4\n" | |
347 | |
348 "por %%xmm1,%%xmm2\n" | |
349 "por %%xmm3,%%xmm4\n" | |
350 | |
351 /* now convert to 16-bit vectors so we can square them */ | |
352 "movdqa %%xmm2,%%xmm1\n" | |
353 "movdqa %%xmm4,%%xmm3\n" | |
354 | |
355 "punpckhbw %%xmm0,%%xmm2\n" | |
356 "punpckhbw %%xmm0,%%xmm4\n" | |
357 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ | |
358 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ | |
359 | |
360 "pmaddwd %%xmm2,%%xmm2\n" | |
361 "pmaddwd %%xmm4,%%xmm4\n" | |
362 "pmaddwd %%xmm1,%%xmm1\n" | |
363 "pmaddwd %%xmm3,%%xmm3\n" | |
364 | |
365 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ | |
366 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ | |
367 | |
368 "paddd %%xmm2,%%xmm1\n" | |
369 "paddd %%xmm4,%%xmm3\n" | |
370 "paddd %%xmm1,%%xmm7\n" | |
371 "paddd %%xmm3,%%xmm7\n" | |
372 | |
373 "decl %2\n" | |
374 "jnz 1b\n" | |
375 | |
376 "movdqa %%xmm7,%%xmm1\n" | |
377 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ | |
378 "paddd %%xmm1,%%xmm7\n" | |
379 "movdqa %%xmm7,%%xmm1\n" | |
380 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ | |
381 "paddd %%xmm1,%%xmm7\n" | |
382 "movd %%xmm7,%3\n" | |
383 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) | |
384 : "r" ((x86_reg)line_size)); | |
385 return tmp; | |
386 } | |
387 | 327 |
388 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { | 328 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { |
389 int tmp; | 329 int tmp; |
390 __asm__ volatile ( | 330 __asm__ volatile ( |
391 "movl %3,%%ecx\n" | 331 "movl %3,%%ecx\n" |
@@ -1374,11 +1314,11 @@
 
         c->hadamard8_diff[0]= hadamard8_diff16_mmx;
         c->hadamard8_diff[1]= hadamard8_diff_mmx;
 
         c->pix_norm1 = pix_norm1_mmx;
-        c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? sse16_sse2 : sse16_mmx;
+        c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
         c->sse[1] = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
 
         c->nsse[0] = nsse16_mmx;
         c->nsse[1] = nsse8_mmx;
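The squaring and reduction in the deleted asm also follow a standard pattern: after punpcklbw/punpckhbw zero-extend the byte differences to 16-bit words, pmaddwd of a register with itself yields d0*d0 + d1*d1 in each 32-bit lane, and the psrldq/paddd/movd epilogue folds the four partial sums into one integer. A sketch of that epilogue with intrinsics (hsum_epi32 is a hypothetical helper, not from this changeset):

    #include <emmintrin.h>

    /* Horizontal sum of four 32-bit lanes, mirroring the
     * psrldq $8 / paddd / psrldq $4 / paddd / movd tail (a sketch). */
    static int hsum_epi32(__m128i v)
    {
        v = _mm_add_epi32(v, _mm_srli_si128(v, 8)); /* add hi qword to lo */
        v = _mm_add_epi32(v, _mm_srli_si128(v, 4)); /* add hi dword to lo */
        return _mm_cvtsi128_si32(v);                /* extract low lane */
    }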