; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; > Sources.PutScaled.

; started WRS 3rd August 1993
; This is link and service code for the compiled form of c.putscaled,
; which exists in s.putscaled.
; It needs no hand patches, except that the register definitions at
; the beginning duplicate existing definitions and should be removed.

; ... is it a problem being in SVC mode? No problems so far!

; Get things that compiled C needs
        GET     sources.CSupport

; --------------------------------------------------------------------------
; Provide a basic debugging output routine.
; R0 = character to output
      [ debug

asm_writech
; Debug-only helper (assembled only when 'debug' is set): send one character
; of trace output to the Neil_* debug routines.
; Entry: R0 = a character to output
;        SL = assembler workspace pointer (copied into R12 below)
; Exit:  R0 = word loaded from Neil_FileHandle
;        R12 restored from the stack by the final Pull
        Push    "R12,LR"
        MOV     R12,SL          ; get back workspace pointer.
        CMP     R0,#10          ; newline char?
        BLEQ    Neil_NewLine    ; character 10 goes to the newline routine...
        BLNE    Neil_WriteC     ; ...anything else goes to the character writer.
                                ; NOTE(review): the BLNE tests the flags as they are after
                                ; Neil_NewLine returns, so this pair relies on that routine
                                ; leaving Z set - confirm against the Neil_* sources.
        ADRL    R0,Neil_FileHandle
        LDR     R0,[R0]         ; return the file handle as a result of this function:
                                ; allows the C to know whether output is turned on,
                                ; <0 return -> no tracing right now.
        Pull    "R12,PC"
      ]

dividebyzero
; Entry point for a division-by-zero failure: emits a debug trace of a2, a1
; and lr (via the Debug macro) and then abandons the plot through 'exit'.
; NOTE(review): presumably reached from the compiled C's divide support - confirm.
        Debug   cc,"DIVIDE BY ZERO",a2,a1,lr
        ; FALL THROUGH to exit.

; --------------------------------------------------------------------------------------
; For an unexpected exit from compiled code, such as an assertion failure.
; There's no error message unless you're debugging, you simply stop the
; whole plot operation and return as fast as possible.
; If debugging, you have (presumably) already put out an error message.
;
; exit   - Entry: SL = assembler workspace pointer.
;          Stores 0 in ccompiler_errptr (no error to report).
; exit_c - Entry: SL = assembler workspace pointer,
;                 r1 = value to store in ccompiler_errptr,
;                 R0 = optional reason code (only used by the Debug trace).
; Both restore R12 from SL and SP from ccompiler_sp (saved before the C was
; entered), then branch to exitbiggie. They do not return to the caller.
exit
        MOV     r1, #0              ; plain exit: no error pointer to record
exit_c
        MOV     R12,SL              ; get back workspace pointer.
        ADRL    R2,ccompiler_errptr
        STR     r1,[r2]             ; record the error pointer (0 when entered via 'exit')
        ADRL    R1,ccompiler_sp
        LDR     SP,[R1]             ; get back SP
        Debug   cc,"unexpected exit c code",R0 ; a reason code is possible - usually C line number.
        B       exitbiggie

; ---------------------------------------------------------------------------------------
; Memory allocation for the C code.
; malloc - r0 = number of bytes to allocate (see PRM3 1-233)
;          on exit, r0 = pointer to block, or 0 for 'none possible'.
malloc
; Claim a block via OS_Module on behalf of the C code.
; Entry: r0 = number of bytes wanted
; Exit:  r0 = pointer to the block, or 0 if the claim failed (in which case
;             the error pointer has been stored in ccompiler_errptr)
;        r1, r2, r3 corrupted
        MOV     r3,r0               ; size goes in r3 for the Claim reason code
        MOV     r1,lr               ; hold the return address in r1 across the SWI
        MOV     r0,#ModHandReason_Claim
        SWI     XOS_Module          ; V clear: r2 -> block.  V set: r0 -> error
from_realloc
; Shared tail, also entered from realloc_delta straight after its SWI:
; expects the SWI's V flag, r0/r2 as returned by it, and the return address in r1.
        MOVVC   r0,r2               ; success - hand the block pointer back
        MOVVC   pc,r1
        ADRL    r2,ccompiler_errptr ; failure - remember the error for later...
        STR     r0,[r2]
        MOV     r0,#0               ; ...and return a null pointer
        MOV     pc,r1

; free - r0 = pointer to previously malloc'd block. (see PRM3 1-234)
free
; Release a block previously obtained from malloc / realloc_delta.
; Entry: r0 = pointer to the block
; Exit:  r0 = reason code, or an error pointer if the SWI failed (callers get
;             no result from free, so any error is deliberately ignored)
;        r1, r2 corrupted
        MOV     r1,lr               ; keep return link
        MOV     r2,r0               ; pointer to heap block
        MOV     r0,#ModHandReason_Free
        SWI     XOS_Module          ; X form, matching malloc and realloc_delta: a failure must
                                    ; return here with V set rather than be raised through the
                                    ; OS error handler in the middle of a plot.
                                    ; No error expected, unless we have internal errors.
        MOV     pc,r1

; realloc_delta - r0 = pointer to block, r1 = CHANGE in size (see PRM3 1-240)
; (Can't provide the real realloc sadly, because these are the args that the OS wants)
realloc_delta
; Grow or shrink a block by a signed amount via OS_Module ExtendBlock.
; Entry: r0 = pointer to the block, r1 = change in size
; Exit:  as for malloc - the result is produced by the shared tail from_realloc
;        (r0 = block pointer, or 0 with the error stored in ccompiler_errptr)
        MOV     r3,r1               ; size change goes in r3
        MOV     r2,r0               ; block pointer goes in r2
        MOV     r1,lr               ; r1 is free now - hold the return address in it
        MOV     r0,#ModHandReason_ExtendBlock
        SWI     XOS_Module
        B       from_realloc        ; finish exactly as malloc does

; ---------------------------------------------------------------------------------------
;; debug output from within the compiled code.
;traceentry1
;        Debug   cc,"Tracepoint: lr,r4=",lr,r4
;        MOV     pc,lr

;; accessed from compiled code as bitblockmove-4 - sorry!
;traceentry
;        B       traceentry1
; ---------------------------------------------------------------------------------------
bitblockmove
; routine for simple bit block move.
;
; Copies r_xcount bits from the input bit stream to the output bit stream,
; where the two streams may start at different bit offsets within their
; first words. Output bits outside the copied span (below the start in the
; first word, above the end in the last word) are preserved by merging with
; what is already in memory.

; This is called by the compiled code when pixels are equal size, no mask, only 'set' gcol action, no table.
; There's no point in 'compiling' it because there are no important variants that we want to compile in,
; so it's clearer to just write it in the assembler.

; Registers on entry:
r_inptr RN      0       ; r_inptr -> input (word pointer)
r_outptr RN     1       ; r_outptr -> output (word pointer)
r_inshift RN    2       ; r_inshift (aka r_inword3) - number of (most significant) bits of first word to transfer, in 1..32
r_outshift RN   3       ; r_outshift (aka r_inword4) - number of (most significant) bits of space in first word to fill up, in 1..32
r_xcount RN     4       ; r_xcount - number of bits to transfer.

; Workspace registers:
r_inword RN     5       ; r_inword, r_inword2 - temp space
r_inword2 RN    6       ; must come AFTER r_inword for LDM
r_outword RN    7       ; r_outword, r_outword2, r_outword3, r_outword4 - temp space
r_outword2 RN   8       ; must come AFTER r_outword for STM
r_shl    RN     9       ; r_shl, r_shr - temp space
r_shr    RN     10
                        ; NB r12 is NOT set up.
                        ; On exit arg registers are corrupted, others preserved

        ; save the workspace registers and the return address; 'done' reloads them (lr -> pc)
        STMDB   sp!,{r_inword,r_inword2,r_outword,r_outword2,r_shl,r_shr,lr}

;        Debug   cc,"bitblockmove: ",r0,r1,r2,r3,r4
;        Debug   cc,"            : ",r5,r6,r7,r8,r9
;        Debug   cc,"            : ",r10,r11,r12,sp,lr,pc

; The following diagrams help when trying to think about shift cases, especially for start conditions etc.
; Note particularly that an entire display line is 'little-endian' - least sig pixel is at the left,
; most sig pixel is at the right in an entirely consistent way.

; Input words:
; 0                             31 0                             31 0                             31 bit number
; |------------------------------| |------------------------------| |------------------------------|
; What to transfer:
;                         |***************************************************** . . .
;                         <-----> this is r_inshift on entry
; Output words:
; 0                             31 0                             31 0                             31 bit number
; |------------------------------| |------------------------------| |------------------------------|
; What to fill up:
;                |***************************************************** . . .
;                <--------------> this is r_outshift on entry

; The difference between r_outshift and r_inshift is the distance that bulk data has to be shifted,
; once we get into the main loop.

; the bottom 32-outshift bits of outword should be loaded
; with whatever is there already.
        LDR     r_outword,[r_outptr]
        MOV     r_outword,r_outword,LSL r_outshift
        MOV     r_outword,r_outword,LSR r_outshift                     ; discard unwanted bits

; xcount counts the number of bits which must be
; saved at r_outptr, of which the first r_shl bits can be found in outword
; and the remainder are still to be fetched from r_inptr.
        RSB     r_outword2,r_outshift,#32                              ; temp use of r_outword2
        ADD     r_xcount,r_xcount,r_outword2                           ; add the bits we've just loaded in

; Only the top r_inshift bits of r_inword are interesting
        LDR     r_inword,[r_inptr],#4
        RSB     r_inword2,r_inshift,#32                                ; temp use of r_inword2
        MOV     r_inword,r_inword,LSR r_inword2                        ; discard unwanted bits
        MOV     r_inword,r_inword,LSL r_inword2

; differing code depending on which of r_inshift and r_outshift is bigger
        CMP     r_outshift,r_inshift
        BEQ     insh_equal
        BLT     insh_more

; r_outshift is bigger than r_inshift:
; the first output word will consist of:
;   bottom 32-outshift bits undisturbed
;   top inshift bits from first input word
;   bottom outshift-inshift (= r_shr bits, shifted left by r_shl) bits from the next input word

        SUB     r_shr,r_outshift,r_inshift
        RSB     r_shl,r_shr,#32
        ORR     r_outword,r_outword,r_inword,LSR r_shr
        ; bottom r_shl bits of r_outword are now loaded with input.
        ; fetch the top of the next word as part of the main loop.
        B       loop64_enter

insh_equal
; No shift offset between input and output - everything a lot simpler!
; the first output word consists of:
;   bottom 32-outshift bits undisturbed
;   top 32-outshift bits of the input
        ORR     r_outword,r_outword,r_inword
        CMP     r_xcount,#32
        BLT     less32                                             ; everything fits inside the first output word
        STR     r_outword,[r_outptr],#4
        SUBS    r_xcount,r_xcount,#32
        BEQ     done
; Now extra-simple 64-bit loop for no-shift case.
        B       loop64_noshift_enter
loop64_noshift
        LDMIA   r_inptr!,{r_inword,r_inword2}
        STMIA   r_outptr!,{r_inword,r_inword2}
loop64_noshift_enter
        SUBS    r_xcount,r_xcount,#64
        BGE     loop64_noshift
        ; set up the state that the shared tail at loop64_exit expects:
        ; nothing held over in r_outword, and shifts that leave words unchanged.
        MOV     r_outword,#0
        MOV     r_shl,#0
        MOV     r_shr,#32
        B       loop64_exit

insh_more
; inshift is bigger than outshift
; the first output word will consist of:
;   bottom 32-outshift bits undisturbed
;   outshift bits from the middle of the input word
        SUB     r_shl,r_inshift,r_outshift
        RSB     r_shr,r_shl,#32
        ORR     r_outword,r_outword,r_inword,LSL r_shl
; We still have r_shl bits of input at the top of r_inword,
; not the correct situation for entering the 64-bit loop (they should be
; at the bottom of r_outword). So, have to do one word of output by steam.
        CMP     r_xcount,#32
        BLT     less32
        STR     r_outword,[r_outptr],#4
        MOV     r_outword,r_inword,LSR r_shr
        SUBS    r_xcount,r_xcount,#32
        BEQ     done
        ; and fall into the 64-bit loop.

; the 64-bit loop - main time-critical bit
; The bottom r_shl bits of r_outword are valid and must be saved at r_outptr.
        SUBS    r_xcount,r_xcount,#64
        BLT     loop64_exit
loop64
        LDMIA   r_inptr!,{r_inword,r_inword2}                      ; pick up 64 input bits
        ORR     r_outword,r_outword,r_inword,LSL r_shl             ; borrow r_shl bits already in r_outword
        MOV     r_outword2,r_inword,LSR r_shr                      ; create outword2
        ORR     r_outword2,r_outword2,r_inword2,LSL r_shl
        STMIA   r_outptr!,{r_outword,r_outword2}                   ; output 64 bits
        MOV     r_outword,r_inword2,LSR r_shr                      ; holding r_shl bits over in r_outword
loop64_enter
        SUBS    r_xcount,r_xcount,#64                              ; loop if at least 64 bits still to do
        BGE     loop64                                             ; loop unless finished
loop64_exit                                                        ; we have finished the 64-bit loop
        ADDS    r_xcount,r_xcount,#64                              ; count how many still to do
        BEQ     done                                               ; exit if exactly finished

; The bottom r_shl bits of r_outword are valid and must be saved at r_outptr.
; r_xcount is less than 64.
        LDMIA   r_inptr!,{r_inword,r_inword2}                      ; all the input we'll ever need
        ORR     r_outword,r_outword,r_inword,LSL r_shl             ; make r_outword valid
        CMP     r_xcount,#32
        STRGE   r_outword,[r_outptr],#4                            ; if xcount >= 32 then do a whole word
        SUBGE   r_xcount,r_xcount,#32                              ; (no S: the flags from the CMP are still needed below)
        BEQ     done                                               ; if exactly 32 bits were left
        MOVGT   r_outword,r_inword,LSR r_shr                       ; create last output word
        ORRGT   r_outword,r_outword,r_inword2,LSL r_shl
less32
; output the bottom xcount (in 1..31) bits of r_outword, combined with what is already at [r_outptr].
        LDR     r_outword2,[r_outptr]                              ; load word already there - we want top 32-xcount bits
        MOV     r_outword2,r_outword2,LSR r_xcount                 ; get rid of unwanted bits
        MOV     r_outword2,r_outword2,LSL r_xcount
        RSB     r_shl,r_xcount,#32
        MOV     r_outword,r_outword,LSL r_shl                      ; get rid of any unwanted new bits
        ORR     r_outword,r_outword2,r_outword,LSR r_shl           ; and combine the two
        STR     r_outword,[r_outptr]                               ; then save - we've finished
done
        ; restore the workspace registers and return (saved lr is loaded into pc)
        LDMIA   sp!,{r_inword,r_inword2,r_outword,r_outword2,r_shl,r_shr,pc}

; --------------------------------------------------------------------------
; Now the entry sequence from the main assembler.
; We B here from the assembler when various calculations
; have already been done, and various values in the assembler workspace
; set up.
; To exit from here we B exitbiggie - r12,r13 must be preserved.
; Entry: r1 = the sprite itself
;        r5 = GCOL action, and whether to use mask.
;        r12 = assembler workspace pointer (of course)
;        r13 = SVC stack (of course)
new_putscaled_compiler
; Entry from the main assembler (see the register conditions listed above).
; Three stages:
;   1. If the colour translation table is an identity mapping, remove it
;      (stores 0 in ColourTTR) so that the compiled code need not use it.
;   2. Call the C routine putscaled_compiler, which returns the address of
;      the code to run in r0.
;   3. Remove the cursors, run that code, restore the cursors, and leave
;      via exitbiggie.

; --------------------------------------------------------------------------------------
; Before entering the C code, one more possible optimisation: we check for
; a lookup table that has no effect at all, and if found remove it.
; You'd be surprised how often such lookup tables get passed in :-)
; They slow down the blitting code, significantly in some cases.
        LDR     r0,ColourTTR               ; get the table
        CMP     r0,#0                      ; if no table
        BEQ     t_exit                     ; then skip this bit
        LDR     r2,BPP                     ; get output bpp
        LDR     r3,save_inlog2bpp          ; get log2 of input bpp
        MOV     r4,#1
        MOV     r4,r4,LSL r3               ; get input bpp
        CMP     r2,r4                      ; output bpp = input bpp?
        BNE     t_exit                     ; if not, don't even try
        CMP     r4,#16                     ; is input bpp 16 or more?
        BGE     t_exit                     ; if so, don't even try
        MOV     r3,#1
        MOV     r3,r3,LSL r4               ; get size of table, in bytes (2,4,16 or 256)
        ; If we reach here it's definitely worth looking through the table.
        ; r0 = table
        ; r3 = size of table
        ; r1,r5 to be preserved
        ; all others are trash.

        MOV     r2,#0                      ; r2 = expected next value in table
        TST     r0,#3                      ; is the table word-aligned?
        TSTEQ   r3,#3                      ; is the table more than 2 bytes?
        BEQ     t_wordaligned              ; if so, skip first loop that does first 1..3 bytes
        ; table pointer not aligned, or table of just 2 entries
t_loop0
        LDRB    r4,[r0],#1                 ; r4 = next value fetched from table
        CMP     r4,r2                      ; should equal expected value
        BNE     t_fail                     ; if not, give up
        ADD     r2,r2,#1                   ; increment expected value
        SUBS    r3,r3,#1                   ; decrement remaining size of table
        BEQ     t_identity                 ; table of just 2 bytes
        TST     r0,#3                      ; are we word aligned yet?
        BNE     t_loop0                    ; loop until word aligned
        ; exit from first-three-bytes loop - table is now word aligned.

        ; The main loop does four table entries at a time.
        ORR     r2,r2,r2,LSL #8            ; construct four copies of current value of table
        ORR     r2,r2,r2,LSL #16
t_wordaligned                              ; we branched to here with r2=0 if already word aligned
        LDR     r6,c03020100
        ADD     r2,r2,r6                   ; r2 = next four values in table (next value in the LOW byte)
        LDR     r6,c04040404               ; r6 = what to add to get next four values
        SUBS    r3,r3,#4                   ; is table size at least four?
        BLT     t_loop_exit                ; if not, it was 2 or 4 to start with and not word-aligned
t_loop                                     ; start of 4-at-a-time loop
        LDR     r4,[r0],#4                 ; r4 = next 4 values fetched from table
        CMP     r4,r2                      ; compare four values
        BNE     t_fail                     ; fail if any one not identical
        ADD     r2,r2,r6                   ; advance all four bytes
        SUBS    r3,r3,#4                   ; counter of remaining table size
        BGE     t_loop
t_loop_exit ; exit from 4-at-a-time loop

        ADDS    r3,r3,#4                   ; remaining table size
        BEQ     t_identity                 ; succeed - normal route for word-aligned table

        ; otherwise, 1..3 bytes left to check as the tail of the table is not word-aligned.
        ; r2 holds the NEXT four expected values, so the next expected byte is its
        ; low byte. (Taking the top byte and adding one would give next+4, which
        ; rejects real identity tables and can accept non-identity ones.)
        AND     r2,r2,#&FF                 ; next expected table value
t_loop2
        LDRB    r4,[r0],#1                 ; next value from table
        CMP     r4,r2                      ; table value = expected value?
        BNE     t_fail                     ; if not, fail
        ADD     r2,r2,#1                   ; increment expected value
        SUBS    r3,r3,#1                   ; decrement size
        BNE     t_loop2                    ; branch until 0
        ; exit from last-three-bytes loop

t_identity
        ; success! discard the table
        STR     r3,ColourTTR               ; r3 known to be 0, as we've tested the whole table

t_exit
t_fail
        ; a normal table, or no table - do nothing.

; --------------------------------------------------------------------------------------
; Now the entry to the C code.
; Entry: r1 = the sprite itself
;        r5 = GCOL action, and whether to use mask.
;        r12 = assembler workspace pointer (of course)
;        r13 = SVC stack (of course)
;        all others are trash.

        LDR     r1,[r1,#spMode]     ; get the mode number/identifier
        STR     r1,save_mode        ; can be picked up by the C from here.
        MOV     R0,R12              ; assembler workspace pointer
        ADRL    R1,ccompiler_space  ; above R12, space for me.
        ADRL    R2,ccompiler_end    ; end of space for me.
        MOV     SL,R12              ; will be left alone by compiled C - for debug routines above.
        ADRL    R3,ccompiler_sp
        STR     SP,[R3]             ; in case of unexpected exit
        MOV     R3,R5               ; GCOL action and mask bit
        Debug   gs,"R1,R5,R12 = ",R1,R5,R12
        MOV     R4,#0
        ADRL    R5,ccompiler_errptr
        STR     R4,[R5]             ; in case of error exit
        Debug   cc,"entering c code"
        BL      putscaled_compiler  ; dive into the C
                                    ; returns r0==compiled code.
        MOV     R12,SL              ; R12 is ip to a C prog, will have been trampled on - restore it.

;; We can't try the next bit until I can actually compile a routine:
;; returning a literal C routine doesn't work, AAsm can't handle what CC puts out.
;; So, do nothing for now.
;        Debug   cc,"exit c code",R0
;        B       exitbiggie

; If debugging, and if Debug$File is set, then do not enter the code.
; Only if Debug$File is unset do we enter the code.

        SWI     XOS_RemoveCursors   ; about to stomp on the screen
        BVS     exitbiggie

        ADR     LR,ret              ; set return address
        MOV     PC,R0               ; and branch to the compiled code
        NOP
ret
        NOP
        SWI     XOS_RestoreCursors
        B       exitbiggie

        ; constants required for table comparison
c03020100 DCD   &03020100           ; table bytes 0,1,2,3 as one little-endian word
c04040404 DCD   &04040404           ; adds 4 to each of the four bytes

      [ jpeg
; ----------------------------------------------------------------------
jpeg_fetchroutine
; Per-line callback used by the compiled PutSpriteScaled code when the source
; is JPEG compressed data. It is a thin veneer onto the C routine declared in
; h.rojpeg as:
;   static int *jpeg_find_line(decompress_info_ptr cinfo, int ycoord, int *palette_data);
; Entry:
;   r0  = y coordinate
;   r12 = wp
; Exit:
;   r0  = value returned by jpeg_find_line: the address to start fetching RGB
;         data from for this line
;   all other registers preserved.
; The caller adds the in_x offset itself; that offset may be a word, byte or
; half-word offset depending on how jpeg_scan_file was called, so it is
; deliberately not applied here.

        Push    "R1-R3,R10-R12,LR"               ; r4-r9 are preserved by C code.

        ADRL    R2,newtranstable                 ; arg3 - int *palette_data
        Debug   gs,"palette in R2 = ",R2

        MOV     R1,R0                            ; arg2 - int ycoord
        ADRL    R0,jpeg_info_ptr
        LDR     R0,[R0]                          ; arg1 - decompress_info_ptr cinfo

        MOV     SL,R12                           ; SL carries wp for the debug routines; compiled C leaves it alone.
        BL      jpeg_find_line                   ; r0 = base of the requested line

        Pull    "R1-R3,R10-R12,PC"               ; put everything back and return to the compiled code.

      ]

; Routine to Create a dynamic area with an Internationalised name.
; Called from C because the C code does not know about the Messages file.

create_dynamic_area
; Create a dynamic area whose name is looked up through copy_error_one.
; Entry: r0 = value passed to OS_DynamicArea in r2
;             (NOTE(review): initial size per the PRM's reason 0 - confirm)
; Exit:  r0 = value returned in r1 by OS_DynamicArea, or 0 if the SWI failed
;        r1-r9 preserved
        Push    "R1-R9,LR"
        MOV     r2, r0                 ; keep the caller's argument clear of the lookup below
        ADRL    r0, ErrorBlock_DynName
        BL      copy_error_one         ; r0 -> looked-up block (error number word, then text)
        CLRV                           ; the lookup result is not an error for our purposes
        ADD     r8, r0, #4             ; r8 -> name string, just past the error number
        ; Remaining OS_DynamicArea arguments, highest register first.
        MOV     r7, #0
        MOV     r6, #0
        MOV     r5, #&600000
        MOV     r4, #0
        MVN     r3, #0                 ; -1
        MVN     r1, #0                 ; -1
        MOV     r0, #0                 ; reason code 0
        SWI     XOS_DynamicArea
        MOVVC   r0, r1                 ; success: return what the SWI left in r1
        MOVVS   r0, #0                 ; failure: return 0
        Pull    "R1-R9,PC"

; Token block handed to copy_error_one: a number word followed by the
; zero-terminated text "token:default".
ErrorBlock_DynName
        DCD     1
        DCB     "DynName:JPEG Workspace", 0
        ALIGN


; ----------------------------------------------------------------------------------------------

        GET     putscaled.s

; Now get optimised assembler bits of JPEG
        GBLL    cfsi_jpeg                        ; for inclusion in ChangeFSI binary?
cfsi_jpeg SETL  {FALSE}

        GET     sources.jdhuff
        GET     sources.jrevdct
        GET     yuvtabs.s
        GET     sources.jcconv
        GET     sources.diffuse
        GET     sources.swiv2
        END