; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; sources.idcto - Ian Rickard's amazing new 5-multiply DCT.
; Thanks to Ian Rickards of ARM Ltd.
; Thanks also to Arai, Agui, Nakajima who actually invented the algorithm.

; Register name aliases used throughout this file.
r0      RN      0
r1      RN      1
r2      RN      2
r3      RN      3
r4      RN      4
r5      RN      5
r6      RN      6
r7      RN      7
r8      RN      8
r9      RN      9
r10     RN      10
r11     RN      11
r12     RN      12
sp      RN      13              ; used as the stack pointer (STMDB/LDMIA sp!)
lr      RN      14              ; holds the return address on entry
pc      RN      15              ; program counter

; Everything below is assembled into a read-only code area.
        AREA    |C$$code|, CODE, READONLY

        EXPORT  asm_idctsq_1_d

;-----------------------------------------------------------------------
; asm_idctsq_1_d
;
; One 8-point DCT pass (the inverse transform, judging by the routine
; name) over 8 words in memory, using the 5-multiply scaled algorithm.
; All constant multiplies are built from shift/add sequences; no MUL
; instruction is used.
;
; In:    r0 = environment pointer (unused)
;        r1 = pointer to 8 consecutive 32-bit signed words holding
;             coefficients 0..7 in natural order
; Out:   the 8 words at r1 are overwritten in place with the results.
;        Values are raised by 12 bits before the transform and lowered
;        by 4 bits after it, so results carry 8 extra bits of scale.
; Clobb: r0-r3.  r4-r10 are stacked on entry and restored on exit;
;        r11, r12 and lr are never written.  No instruction in the
;        routine sets the condition flags.
; Stack: 7 words, full descending.
;
; The transform core (between "starts" and "ends" below):
;   takes coeff array in r0-r7
;   contains coeffs 0, 4, 2, 6, 5, 1, 7, 3 in r0-r7 respectively
;   uses r8-r9 as scratch
;   total 43 data processing instructions
;-----------------------------------------------------------------------
asm_idctsq_1_d
        STMDB   sp!, {r4-r10}
        MOV     r10,r1                  ; keep data pointer; r1 is reused below
        LDMIA   r10, {r0-r7}            ; r0-r7 = coefficients 0-7

; Rearrange the inputs into the order the core expects.
; register <- coefficient:
;   r0 <- 0
;   r1 <- 4
;   r2 <- 2
;   r3 <- 6
;   r4 <- 5
;   r5 <- 1
;   r6 <- 7
;   r7 <- 3
; r0 and r2 are already in place; the rest are two 3-cycles rotated
; through r8.

        MOV     r8,r3                   ; r8 = coeff 3
        MOV     r3,r6                   ; r3 = coeff 6
        MOV     r6,r7                   ; r6 = coeff 7
        MOV     r7,r8                   ; r7 = coeff 3
        MOV     r8,r4                   ; r8 = coeff 4
        MOV     r4,r5                   ; r4 = coeff 5
        MOV     r5,r1                   ; r5 = coeff 1
        MOV     r1,r8                   ; r1 = coeff 4

; all is well now.

; Scale the inputs up by 12 bits so the shift/add multiplies below keep
; fractional precision.  The results are scaled back down by 4 bits
; after the transform, leaving the output shifted up by 8 overall -
; exactly like the existing code.
        MOV     r0,r0,LSL #12
        MOV     r1,r1,LSL #12
        MOV     r2,r2,LSL #12
        MOV     r3,r3,LSL #12
        MOV     r4,r4,LSL #12
        MOV     r5,r5,LSL #12
        MOV     r6,r6,LSL #12
        MOV     r7,r7,LSL #12

; Disabled in the original source; the purpose is not recorded
; (presumably a rounding bias - confirm before re-enabling).
;        ORR     r0,r0,#128
;        ORR     r1,r1,#128
;        ORR     r2,r2,#128
;        ORR     r3,r3,#128
;        ORR     r4,r4,#128
;        ORR     r5,r5,#128
;        ORR     r6,r6,#128
;        ORR     r7,r7,#128

; Now start the DCT proper.
; The recurring pair
;        ADD     a,a,b
;        SUB     b,a,b,ASL #1
; is a butterfly: a' = a + b, b' = a - b.

; starts
        ADD     r5,r5,r6
        SUB     r6,r5,r6,ASL #1
        SUB     r4,r4,r7
        ADD     r7,r4,r7,ASL #1
        SUB     r5,r5,r7
        ADD     r7,r5,r7,ASL #1
        ADD     r2,r2,r3
        SUB     r3,r2,r3,ASL #1
        ADD     r8,r4,r6

; mul r8 by 0.383 (or 392.2/1024)
        SUB     r8,r8,r8,ASR #3  ;  =  448/512
        SUB     r8,r8,r8,ASR #3  ;  =  392/512
; leave pending ASR #1 until instruction marked FIX_a5

; mul r2 by 0.707 (or 724.0/1024)
        SUB     r9,r2,r2,ASR #4  ;  =  960/1024
        ADD     r9,r9,r9,ASR #2  ;  = 1200/1024
        SUB     r2,r2,r9,ASR #2  ;  =  724/1024

        ADD     r0,r0,r1
        SUB     r1,r0,r1,ASL #1
        ADD     r3,r2,r3

; mul r4 by 0.541 (or 554.0/1024)
        ADD     r9,r4,r4,ASR #1  ;  =  768/512
        SUB     r9,r9,r9,ASR #3  ;  =  672/512
        ADD     r4,r4,r9,ASR #4  ;  =  554/512
; leave ASR #1 until instruction marked FIX_a2

; mul r5 by 0.707 (or 724.0/1024)
        SUB     r9,r5,r5,ASR #4  ;  =  960/1024
        ADD     r9,r9,r9,ASR #2  ;  = 1200/1024
        SUB     r5,r5,r9,ASR #2  ;  =  724/1024

; mul r6 by 1.307 (or 1338.4/1024)
        ADD     r9,r6,r6,ASR #1  ;  = 1536/1024
        SUB     r6,r9,r9,ASR #3  ;  = 1344/1024
        SUB     r6,r6,r9,ASR #8  ;  = 1338/1024

        ADD     r0,r0,r3
        SUB     r3,r0,r3,ASL #1
        ADD     r1,r1,r2
        SUB     r2,r1,r2,ASL #1
        ADD     r4,r4,r8         ;  FIX_a2 & FIX_a5 => FIX_r4 below
        SUB     r6,r6,r8,ASR #1  ;  FIX_a5
        ADD     r7,r6,r7
        ADD     r6,r5,r6
        ADD     r5,r5,r4,ASR #1  ;  FIX_r4
        ADD     r0,r0,r7
        SUB     r7,r0,r7,ASL #1
        ADD     r1,r1,r6
        SUB     r6,r1,r6,ASL #1
        ADD     r2,r2,r5
        SUB     r5,r2,r5,ASL #1
        ADD     r3,r3,r4,ASR #1  ;  FIX_r4
        SUB     r4,r3,r4         ;  FIX_r4
; ends

; Drop 4 of the 12 working bits (arithmetic shift keeps the sign),
; leaving the results shifted up by 8.
        MOV     r0,r0,ASR #4
        MOV     r1,r1,ASR #4
        MOV     r2,r2,ASR #4
        MOV     r3,r3,ASR #4
        MOV     r4,r4,ASR #4
        MOV     r5,r5,ASR #4
        MOV     r6,r6,ASR #4
        MOV     r7,r7,ASR #4

        STMIA   r10, {r0-r7}            ; write results back over the inputs
        LDMIA   sp!, {r4-r10}

; Return with a plain MOV.  MOVS pc,lr restores the PSR from lr, which
; is only meaningful in 26-bit modes; in a 32-bit mode it would load
; the CPSR from the SPSR (and is unpredictable in User mode).  Nothing
; in this routine sets the flags, so the caller's flags are still
; intact and MOV pc,lr returns correctly in both 26-bit and 32-bit
; modes.
        MOV     pc,lr

        ; Export asm_j_rev_dct.  Its definition is not in this file;
        ; presumably it comes from the source included below (TODO confirm).
        EXPORT  asm_j_rev_dct
        ; Include the source file sources.idct at this point.
        GET     sources.idct

        END