Mercurial > libavcodec.hg
changeset 4452:c66326f1f635 libavcodec
optimize IDCT of rows with mostly zero coefficients
author | mru |
---|---|
date | Wed, 31 Jan 2007 23:04:56 +0000 |
parents | 9fa2c8a7e4d8 |
children | 22827cd6b228 |
files | armv4l/simple_idct_armv6.S |
diffstat | 1 files changed, 55 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/armv4l/simple_idct_armv6.S Wed Jan 31 22:58:53 2007 +0000 +++ b/armv4l/simple_idct_armv6.S Wed Jan 31 23:04:56 2007 +0000 @@ -90,6 +90,32 @@ .endm /* + Compute partial IDCT of half row. + shift = left-shift amount + a3 = row[2,0] + a4 = row[3,1] + + Output in registers v1--v8 +*/ + .macro idct_row4 shift + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ + ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ + mov a2, #(1<<(\shift-1)) + smlad v1, a3, ip, a2 + smlsd v4, a3, ip, a2 + ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ + smlad v2, a3, lr, a2 + smlsd v3, a3, lr, a2 + smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ + smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ + pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ + pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ + smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ + smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* Compute final part of IDCT single row without shift. Input in registers v1--v8 Output in registers ip, v1--v3, lr, v5--v7 @@ -167,10 +193,26 @@ .align .func idct_row_armv6 idct_row_armv6: - stmfd sp!, {a2, lr} + str lr, [sp, #-4]! + + ldr lr, [a1, #12] /* lr = row[7,5] */ + ldr ip, [a1, #4] /* ip = row[6,4] */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + ldr a3, [a1] /* a3 = row[2,0] */ + orrs lr, lr, ip + cmpeq lr, a4 + cmpeq lr, a3, lsr #16 + beq 1f + str a2, [sp, #-4]! + cmp lr, #0 + beq 2f idct_row ROW_SHIFT - ldr a2, [sp], #4 + b 3f + +2: idct_row4 ROW_SHIFT + +3: ldr a2, [sp], #4 idct_finish_shift ROW_SHIFT strh v1, [a2] @@ -183,6 +225,17 @@ strh v5, [a2, #(16*7)] ldr pc, [sp], #4 + +1: mov a3, a3, lsl #3 + strh a3, [a2] + strh a3, [a2, #(16*2)] + strh a3, [a2, #(16*4)] + strh a3, [a2, #(16*6)] + strh a3, [a2, #(16*1)] + strh a3, [a2, #(16*3)] + strh a3, [a2, #(16*5)] + strh a3, [a2, #(16*7)] + ldr pc, [sp], #4 .endfunc /*