Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12227:d07e6037846d libavcodec
Use pextrw for SSE4 mbedge filter result writing, a speedup of 5-10 cycles on
CPUs supporting it.
author | rbultje |
---|---|
date | Thu, 22 Jul 2010 19:59:34 +0000 |
parents | 657d353cd515 |
children | e08d65897115 |
comparison
equal
deleted
inserted
replaced
12226:36940062c081 | 12227:d07e6037846d |
---|---|
1930 ; int flimE, int flimI, int hev_thr); | 1930 ; int flimE, int flimI, int hev_thr); |
1931 ;----------------------------------------------------------------------------- | 1931 ;----------------------------------------------------------------------------- |
1932 | 1932 |
1933 ; write 4 or 8 words in the mmx/xmm registers as 8 lines | 1933 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
1934 ; 1 and 2 are the registers to write, this can be the same (for SSE2) | 1934 ; 1 and 2 are the registers to write, this can be the same (for SSE2) |
1935 ; for pre-SSE4: | |
1935 ; 3 is a general-purpose register that we will clobber | 1936 ; 3 is a general-purpose register that we will clobber |
1937 ; for SSE4: | |
1938 ; 3 is a pointer to the destination's 5th line | |
1936 ; 4 is a pointer to the destination's 4th line | 1939 ; 4 is a pointer to the destination's 4th line |
1937 ; 5 is -stride and +stride | 1940 ; 5/6 is -stride and +stride |
1938 %macro WRITE_8W 6 | 1941 ; 7 is optimization string |
1942 %macro WRITE_8W 7 | |
1943 %ifidn %7, sse4 | |
1944 pextrw [%4+%5*4], %1, 0 | |
1945 pextrw [%3+%5*4], %1, 1 | |
1946 pextrw [%4+%5*2], %1, 2 | |
1947 pextrw [%4+%5 ], %1, 3 | |
1948 pextrw [%4 ], %1, 4 | |
1949 pextrw [%3 ], %1, 5 | |
1950 pextrw [%3+%6 ], %1, 6 | |
1951 pextrw [%3+%6*2], %1, 7 | |
1952 %else | |
1939 movd %3, %1 | 1953 movd %3, %1 |
1940 %if mmsize == 8 | 1954 %if mmsize == 8 |
1941 punpckhdq %1, %1 | 1955 punpckhdq %1, %1 |
1942 %else | 1956 %else |
1943 psrldq %1, 4 | 1957 psrldq %1, 4 |
1971 mov [%4+%6 ], %3w | 1985 mov [%4+%6 ], %3w |
1972 shr %3, 16 | 1986 shr %3, 16 |
1973 mov [%4+%6*2], %3w | 1987 mov [%4+%6*2], %3w |
1974 %if mmsize == 8 | 1988 %if mmsize == 8 |
1975 add %4, %5 | 1989 add %4, %5 |
1990 %endif | |
1976 %endif | 1991 %endif |
1977 %endmacro | 1992 %endmacro |
1978 | 1993 |
1979 %macro MBEDGE_LOOPFILTER 5 | 1994 %macro MBEDGE_LOOPFILTER 5 |
1980 %if %4 == 8 ; chroma | 1995 %if %4 == 8 ; chroma |
2507 SBUTTERFLY bw, 5, 6, 0 | 2522 SBUTTERFLY bw, 5, 6, 0 |
2508 | 2523 |
2509 %if mmsize == 8 ; mmx/mmxext (h) | 2524 %if mmsize == 8 ; mmx/mmxext (h) |
2510 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg | 2525 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
2511 add dst_reg, 4 | 2526 add dst_reg, 4 |
2512 WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg | 2527 WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4 |
2513 %else ; sse2 (h) | 2528 %else ; sse2 (h) |
2514 lea dst8_reg, [dst8_reg+mstride_reg+1] | 2529 lea dst8_reg, [dst8_reg+mstride_reg+1] |
2515 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 | 2530 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
2516 lea dst_reg, [dst2_reg+mstride_reg+4] | 2531 lea dst_reg, [dst2_reg+mstride_reg+4] |
2517 lea dst8_reg, [dst8_reg+mstride_reg+4] | 2532 lea dst8_reg, [dst8_reg+mstride_reg+4] |
2518 WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg | 2533 WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2 |
2519 WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg | 2534 %ifidn %2, sse4 |
2535 lea dst_reg, [dst8_reg+ stride_reg] | |
2536 %endif | |
2537 WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2 | |
2520 %endif | 2538 %endif |
2521 %endif | 2539 %endif |
2522 | 2540 |
2523 %if mmsize == 8 | 2541 %if mmsize == 8 |
2524 %if %4 == 8 ; chroma | 2542 %if %4 == 8 ; chroma |
2572 %else | 2590 %else |
2573 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 | 2591 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16 |
2574 %endif | 2592 %endif |
2575 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16 | 2593 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16 |
2576 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16 | 2594 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16 |
2595 | |
2596 %ifdef m8 | |
2597 MBEDGE_LOOPFILTER sse4, h, 5, 16, 16 | |
2598 %else | |
2599 MBEDGE_LOOPFILTER sse4, h, 6, 16, 16 | |
2600 %endif | |
2601 MBEDGE_LOOPFILTER sse4, h, 6, 8, 16 |