; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; hand-coded DCT
; Based on the inverse DCT in the Indep JPEG Group's release 4 code - look in c.jrevdct4.
;   This C source, and the assembler output from the C (from cc 4.50) are both extremely
;   useful in understanding this code. In particular CC was used to turn the multiplies
;   into sequences of adds.
; 24-Sep-93 - started
; 21-Oct-93 - expanded-out-odd-cases tried and abandoned (see below).
; 13-Jan-94 - Spitfire bug found, thanx a million Mark Taunton.

; a 1D DCT takes 8 integers, adds and subtracts them, gives you 8 other integers.
; a 2D DCT does this to all the rows, then to all the columns (ie 16 times).
; The blocks (of 64 integers) are initially quite sparse, but they soon fill up.
; If the inputs (of a 1D DCT) are all zero, the outputs are all zero.
; If only the first arg is nonzero, the outputs are all replicas of the first arg.

; In this code for the 1D DCT components 0,2,4,6 are combined, then 1,3,5,7, then
; the results folded together. I have tried expanding out the handling of the odd
; components so that there are four special cases for only one odd arg nonzero,
; but it made the result slower! Probably through destroying cache locality in the
; whole system. I've seen code for MPEG, which has the same problem to solve, which
; does this far more - a win on machines with a large cache or no cache. Also,
; maybe MPEG typically has sparser DCTs to perform?

; This DCT works to 8 bits of precision, whereas IJG4 uses 13. I have compared
; the results side-by-side on a 32bpp screen, and I can't see any significant
; differences. JPEG has already thrown away (sorry, 'quantized') all the numbers
; a lot anyway.

; ------------------------------------------------------------------------
; The DCT macro
; ------------------------------------------------------------------------

; The basic 1-D DCT is coded here as a macro, because the code is replicated
; for rows and for columns.

; There are various basic paths through a 1D DCT in this code:
;   all the values are zero - nothing need be done. (common, esp in 1st pass)
;   only r0 is non-zero - replicate it (also common, ditto)
;   r1/r3/r5/r7 are zero, others are non-zero
;   r0/r2/r4/r6 are zero, others are non-zero
; Note that only two tests are actually coded: "r2 and r6 both zero" (branch
; to $rc._zr26) and "r1, r3, r5 and r7 all zero" (branch to
; $rc._odd_shortcut). The cases listed above are all reached through those
; two tests.

; DCTransform $rc - first half of the 1-D transform.
;
; (Documentation is kept out here rather than inside the macro body: the
;  transform is split into DCTransform/DCTransform2 to avoid 'macro too big'
;  errors, so the bodies are best not enlarged.)
;
;   $rc     label prefix. This macro defines $rc._odd, and branches to
;           $rc._zr26 and $rc._odd_shortcut, both of which are defined by
;           DCTransform_Leftovers $rc.
;   Entry:  r0-r7 = the eight input values.
;   Exit:   r0 = tmp10, r2 = tmp12, r4 = tmp13, r6 = tmp11 (even half finished)
;           r1, r3, r5, r7 = odd inputs, unchanged
;           r10 = z4, r12 = z3 (each already multiplied and with z5 added)
;           r11 = z1 = r7 + r1 (not yet multiplied)
;           lr and the flags are corrupted; r8 and r9 are not touched.
;   "DCTransform2 $rc" must follow immediately - it consumes the state above.
;
; Each "MulCon" comment is followed by the shift-and-add sequence that replaces
; that multiply. The constants are scaled by 256 (8 fractional bits), and the
; non-multiplied terms are brought to the same scale with LSL #8. The integer
; multipliers the sequences in this macro implement are:
;     FIX_0_541196100 ->  139      - FIX_1_847759065 -> -473
;     FIX_0_765366865 ->  196        FIX_1_175875602 ->  301
;   - FIX_1_961570560 -> -502      - FIX_0_390180644 -> -100

        MACRO
        DCTransform $rc

; the even components.

        ORRS    r10,r2,r6
        BEQ     $rc._zr26                 ; B if r2/6 are zero

        ADD     r10,r2,r6

        ;MulCon  r11,r10,FIX_0_541196100   ; z1
        RSB     r11,r10,r10,LSL #4
        ADD     r11,r11,r11,LSL #3
        ADD     r11,r11,r10,LSL #2

        ;MulCon  r10,r6,- FIX_1_847759065
        SUB     r10,r6,r6,LSL #5
        RSB     r10,r10,r10,LSL #4
;        SUB     r10,r11,r10,LSL #3        ; tmp2
        SUB     r10,r10,r6,LSL #3
        ADD     r10,r10,r11                ; tmp2

        ;MulCon  r12,r2,FIX_0_765366865
        ADD     r12,r2,r2,LSL #4
        ADD     r12,r12,r2,LSL #5
        ADD     r12,r11,r12,LSL #2        ; tmp3
        ; scratch z1=r11
        ; scratch r2,r6

        SUB     r11,r0,r4                 ; tmp1 (not shifted)
        ADD     r6,r10,r11,LSL #8         ; tmp11 = tmp1 + tmp2;
        RSB     r2,r10,r11,LSL #8         ; tmp12 = tmp1 - tmp2;

        ADD     r11,r0,r4                 ; tmp0 (not shifted)
        ; scratch r0,r4
        ADD     r0,r12,r11,LSL #8         ; tmp10 = tmp0 + tmp3;
        RSB     r4,r12,r11,LSL #8         ; tmp13 = tmp0 - tmp3;
        ; scratch r11=tmp0
; scratch r10,r11,r12
; r0/2/4/6 have been replaced by tmp10/12/13/11

; the odd components
        ORRS    r10,r1,r3
        ORREQS  r10,r5,r7
        BEQ     $rc._odd_shortcut         ; shortcut
$rc._odd

; the pressure on registers is severe: do z3,z4,z5 so that
; z5 can be discarded before doing anything with z1,z2

        ADD     r10,r7,r3                 ; z3
        ADD     r11,r5,r1                 ; z4
        ADD     r12,r10,r11               ; ...z3+z4

        ;MulCon  lr,r12,FIX_1_175875602    ; z5
        ADD     lr,r12,r12,LSL #5
        ADD     lr,lr,lr,LSL #3
        ADD     lr,lr,r12,LSL #2
        ; scratch r12

        ;MulCon  r12,r10,- FIX_1_961570560 ; z3
        SUB     r12,r10,r10,LSL #1
        RSB     r12,r12,r12,LSL #8
        ADD     r12,r12,r10,LSL #2        ; (still needs to <<1)
        ; scratch r10

        ;MulCon  r10,r11,- FIX_0_390180644 ; z4
        SUB     r10,r11,r11,LSL #1
        ADD     r10,r10,r10,LSL #5
        ADD     r10,r10,r11,LSL #3         ; (still needs to <<2)
;        RSB     r10,r10,r10,LSL #8
;        ADD     r10,r10,r11,LSL #2        ; (still needs to <<2)
        ; scratch r11

        ADD     r12,lr,r12,LSL #1         ; z3 += z5; (did the <<1 from above)
        ADD     r10,lr,r10,LSL #2         ; z4 += z5; (did the <<2 from above)
        ; scratch lr=z5
        ; now do z1 all the way through before doing z2.
        ; registers: r12=z3
        ;            r10=z4
        ;            r11,lr scratch
        ADD     r11,r7,r1                 ; z1

; prevent 'macro too big' errors!
        MEND
        ; DCTransform2 $rc - second half of the 1-D transform. It must directly
        ; follow "DCTransform $rc" (same $rc), whose register state it consumes.
        ;
        ;   Entry:  r0 = tmp10, r2 = tmp12, r4 = tmp13, r6 = tmp11
        ;           r1, r3, r5, r7 = odd inputs
        ;           r10 = z4, r11 = z1 (r7+r1, not yet multiplied), r12 = z3
        ;   Exit:   r0-r7 = the eight outputs, in order
        ;           r10, r11, r12 and lr are corrupted; r8 and r9 are not touched.
        ;   Defines $rc._store as its very last line, so the label addresses
        ;   whatever the caller places straight after the macro (the store).
        ;
        ; No descaling (right shift) is applied here: the outputs keep the
        ; factor of 256 introduced by the 8-bit fixed-point constants.
        ;
        ; Integer multipliers implemented by the shift-and-add sequences below:
        ;     FIX_0_298631336 ->   76        FIX_1_501321110 ->  384
        ;   - FIX_0_899976223 -> -230      - FIX_2_562915447 -> -656
        ;     FIX_2_053119869 ->  526        FIX_3_072711026 ->  787
        MACRO
        DCTransform2 $rc

        ;MulCon  lr,r7,FIX_0_298631336     ; tmp0
        ADD     lr,r7,r7,LSL #1
        ADD     lr,lr,r7,LSL #4           ; (still need to do <<2)
        ; scratch r7

        ;MulCon  r7,r1,FIX_1_501321110     ; tmp3
        ADD     r7,r1,r1,LSL #1           ; (still need to <<7)
        ; scratch r1

        ;MulCon  r1,r11,- FIX_0_899976223  ; z1
        ADD     r1,r11,r11,LSL #2
        ADD     r1,r1,r11,LSL #3
        SUB     r1,r1,r11,LSL #7          ; (still need to <<1)
        ; scratch r11

        ADD     lr,r12,lr,LSL #2          ; tmp0 += z3;         (did the <<2 for tmp0 above)
        ADD     lr,lr,r1,LSL #1           ; tmp0 += z1;         (did the <<1 for z1 above)
        ADD     r7,r10,r7,LSL #7          ; tmp3 += z4;         (did the <<7 for tmp3 above)
        ADD     r7,r7,r1,LSL #1           ; tmp3 += z1;         (did the <<1 for z1 above)
        ; scratch r1
        ; now we can do z2
        ; registers: r7=tmp3
        ;            r10=z4
        ;            r12=z3
        ;            lr=tmp0
        ;            r1,r11 scratch
        ADD     r1,r5,r3                  ; z2

        ;MulCon  r11,r1,- FIX_2_562915447  ; z2
        SUB     r11,r1,r1,LSL #1
        ADD     r11,r11,r11,LSL #5
        SUB     r11,r11,r1,LSL #3         ; (still need to <<4)
        ; scratch r1

        ADD     r10,r10,r11,LSL #4        ; ...z2+z4 (did the <<4 above)
        ADD     r12,r12,r11,LSL #4        ; ...z2+z3 (did the <<4 above)
        ; scratch r11

        ;MulCon  r11,r5,FIX_2_053119869    ; tmp1
        RSB     r11,r5,r5,LSL #3
        ADD     r11,r11,r5,LSL #8         ; (still need to <<1)
        ; scratch r5

        ;MulCon  r5,r3,FIX_3_072711026     ; tmp2
        ADD     r5,r3,r3,LSL #8
        ADD     r5,r5,r5,LSL #1
        ADD     r5,r5,r3,LSL #4
        ; scratch r3

        ADD     r11,r10,r11,LSL #1        ; tmp1 += z2 + z4; (did the <<1 on tmp1 above)
        ADD     r10,r5,r12                ; tmp2 += z2 + z3;

        ; final combination and output stage:
        ; r0 = tmp10
        ; r1 =             (scratch)
        ; r2 = tmp12
        ; r3 =             (scratch)
        ; r4 = tmp13
        ; r5 =             (scratch)
        ; r6 = tmp11
        ; r7 = tmp3
        ; r8 = loop counter
        ; r9 = data pointer
        ; r10= tmp2
        ; r11= tmp1
        ; r12=             (scratch)
        ; r13= sp
        ; lr = tmp0
        ADD     r1,r6,r10                 ; tmp11 + tmp2
        ADD     r3,r4,lr                  ; tmp13 + tmp0
        SUB     r5,r2,r11                 ; tmp12 - tmp1
        SUB     r4,r4,lr                  ; tmp13 - tmp0
        ADD     r2,r2,r11                 ; tmp12 + tmp1
        SUB     r6,r6,r10                 ; tmp11 - tmp2
        ADD     r0,r0,r7                  ; tmp10 + tmp3
        SUB     r7,r0,r7,LSL #1           ; tmp10 - tmp3 (actually tmp10 + tmp3 - 2*tmp3)
$rc._store
        MEND

; DCTransform_Leftovers $rc,$nullsave - the out-of-line shortcut paths that
; "DCTransform $rc" branches to. It defines $rc._zr26 and $rc._odd_shortcut.
;
; The 'nullsave' argument tells the macro if the answer goes back in the same
; place as the input came from - TRUE for rows, FALSE for columns. If TRUE and
; all inputs zero, don't even have to save outputs.
;
;   $rc._zr26          entered with r2 and r6 zero. Builds the even half from r0
;                      and r4 alone, then either rejoins the full odd-component
;                      code at $rc._odd (defined by DCTransform) or falls into
;                      $rc._odd_shortcut.
;   $rc._odd_shortcut  entered with r0/r2/r4/r6 = tmp10/tmp12/tmp13/tmp11 and
;                      r1, r3, r5, r7 all zero. Copies the even results into the
;                      mirrored output positions so that r0-r7 are the outputs.
;   $rc._inc           only referenced when $nullsave is TRUE; the caller must
;                      define it at its "skip the store" point.
; r10 and the flags are corrupted. The macro does not end in a branch: the
; caller must follow it with the store/end-of-loop code or a B to $rc._store.
        MACRO
        DCTransform_Leftovers $rc,$nullsave
$rc._zr26
        ; r2/6 are zero - we can take a shortcut through the first quadrant.
        ; tmp2 and tmp3 are zero, tmp0=(r0+r4)<<8, tmp1=(r0-r4)<<8.
        ; we do this rather than replicate-DC because it's only a little slower,
        ; especially considering we only did one instruction of test to get here,
        ; and handles one more case.
        SUB     r6,r0,r4
        MOV     r6,r6,LSL #8             ; tmp11 = tmp1 + tmp2;
        MOV     r2,r6                    ; tmp12 = tmp1 - tmp2;
        ADD     r0,r0,r4
        MOV     r0,r0,LSL #8             ; tmp10 = tmp0 + tmp3;
        MOV     r4,r0                    ; tmp13 = tmp0 - tmp3;
        ; We also check for r1-r7 being zero (so replicate r0 << 8), or
        ; r0-r7 being zero in which case nothing to be done at all.
        ORRS    r10,r1,r3
        ORREQS  r10,r5,r7
        BNE     $rc._odd                 ; odd coeffs NOT all zero, handle them normally.

      [ $nullsave.
        ; If r0-r7 are all zero, AND we're saving the answer back to the same
        ; place, then don't even bother to save. But if answer goes somewhere
        ; different, we must save - this was the 'spitfire' bug.
        ; 1-3,5-7 are all zero - worth checking for 1-7 zero, and for all zero.
        ORRS    r10,r0,r2                ; r0+r4,r0-r4 both zero -> r0,r4 were both zero
        BEQ     $rc._inc                 ; r0-r7 are all zero - don't even save
      ]

        ; code for 1-7 zero, works for any case where 1/3/5/7 are zero in fact.
$rc._odd_shortcut
        MOV     r7,r0                    ; out7 = out0 = tmp10
        MOV     r1,r6                    ; out1 = out6 = tmp11
        MOV     r5,r2                    ; out5 = out2 = tmp12
        MOV     r3,r4                    ; out3 = out4 = tmp13
        ; it's as if you B'd to $rc._store here.
        ; the caller of this macro will replicate the end-of-loop code,
        ; or put in the suitable B.
        MEND

;; ------------------------------------------------------------------------
;; Test proc - procedure to do a 1-D DCT
;; ------------------------------------------------------------------------
;; extern void dct_1d(decompress_info_ptr cinfo, int *data);

;asm_dct_1_d
;        STMDB   sp!,{r0-r12,lr}          ; save state
;        MOV     r9,r1                    ; data pointer
;        LDMIA   r9,{r0-r7}               ; get this row
;        DCTransform dct1d
;        DCTransform2 dct1d
;        STMIA   r9!,{r0-r7}              ; put this row
;dct1d_inc
;        LDMIA   sp!,{r0-r12,pc}          ; exit

;        DCTransform_Leftovers dct1d,{TRUE}
;        STMIA   r9!,{r0-r7}              ; put this row
;        LDMIA   sp!,{r0-r12,pc}          ; exit

; ------------------------------------------------------------------------
; The main routine
; ------------------------------------------------------------------------
; extern void asm_j_rev_dct(decompress_info_ptr cinfo, DCTBLOCK data, int count);
;
; Entry:  r0 = cinfo (unused)
;         r1 = data ptr (first block, 64 words)
;         r2 = count of blocks; returns at once if count <= 0
; Exit:   r4-r12 restored; r3 is not preserved.
;
; Every block gets two passes: the eight rows are transformed in place, then
; the eight columns are transformed and written out, in row order, to the 64
; words lying immediately below the block being read.

asm_j_rev_dct

        CMP     r2,#0                     ; anything to do?
        MOVLE   pc,lr                     ; no - straight back to the caller

        STMDB   sp!,{r1,r2,r4-r12,lr}     ; [sp] = block pointer, [sp,#4] = count, then r4-r12,lr

dct_loop                                  ; once per block; r1 = block pointer

        MOV     r9,r1                     ; r9 = current row
        MOV     r8,#8                     ; r8 = rows still to do

; Pass 1 - rows, in place
dctrow_loop
        LDMIA   r9,{r0-r7}                ; fetch the row
        DCTransform dctrow
        DCTransform2 dctrow
        STMIA   r9,{r0-r7}                ; dctrow_store: write the row back
dctrow_inc                                ; all-zero rows arrive here without storing
        SUBS    r8,r8,#1                  ; one row fewer (flags survive the ADD below)
        ADD     r9,r9,#8*4                ; step to the next row
        BNE     dctrow_loop
dctrow_done                               ; r9 = block pointer + 64 words

; Pass 2 - columns. Inputs are read 8 words apart; each result is stored as
; a row of the output block.
        SUB     r8,r9,#2*8*8*4            ; r8 = output pointer, one block below the input
        SUB     r9,r9,#8*8*4              ; r9 = input pointer, back at the top of the block
dctcol_loop
        ; gather one column
        LDR     r0,[r9]
        LDR     r1,[r9,#1*8*4]
        LDR     r2,[r9,#2*8*4]
        LDR     r3,[r9,#3*8*4]
        LDR     r4,[r9,#4*8*4]
        LDR     r5,[r9,#5*8*4]
        LDR     r6,[r9,#6*8*4]
        LDR     r7,[r9,#7*8*4]

        DCTransform dctcol
        DCTransform2 dctcol
        STMIA   r8,{r0-r7}                ; dctcol_store: emit the column as a row of the output block
dctcol_inc
        ADD     r9,r9,#4                  ; next input column
        ADD     r8,r8,#8*4                ; next output row
        SUB     r10,r9,r8                 ; the gap shrinks by 28 bytes per column, from 256 ...
        CMP     r10,#8*4                  ; ... and reaches 32 once all eight are done
        BGT     dctcol_loop

; Block finished - see whether there is another one
        LDMIA   sp!,{r1,r2}               ; recover block pointer and count
        SUBS    r2,r2,#1
        LDMLEIA sp!,{r4-r12,pc}           ; none left: restore and return
        ADD     r1,r1,#64*4               ; otherwise move on one block ...
        STMDB   sp!,{r1,r2}               ; ... re-stack pointer and count ...
        B       dct_loop                  ; ... and go round again

; --------------------------------------------------------------------------
; Out-of-line shortcut paths for the two transform instances.

; Rows: the end-of-loop code is repeated here rather than branching back.
        DCTransform_Leftovers dctrow,{TRUE}
        STMIA   r9!,{r0-r7}               ; write the row back and step past it
        SUBS    r8,r8,#1                  ; one row fewer
        BNE     dctrow_loop               ; more rows to do
        B       dctrow_done               ; that was the last row

; Columns: rejoin the main path at its store.
        DCTransform_Leftovers dctcol,{FALSE}
        B       dctcol_store

        END