; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; Sources.PutScaled.
; started WRS 3rd August 1993

; This is link and service code for the compiled form of c.putscaled,
; which exists in s.putscaled.
; It needs no hand patches, except that the register definitions at
; the beginning duplicate existing definitions and should be removed.

; ... is it a problem being in SVC mode? No problems so far!

; Get things that compiled C needs
        GET     sources.CSupport

; --------------------------------------------------------------------------
; Provide a basic debugging output routine (only assembled when 'debug' is set).
; R0 = string to output
        [ debug
asm_writech
; Entry: R0 = a character to output
;        SL = assembler workspace pointer (copied into R12 below)
; Exit:  R0 = word loaded from Neil_FileHandle
;        R12 preserved
        Push    "R12,LR"
        MOV     R12,SL                  ; get back workspace pointer.
        CMP     R0,#10                  ; newline char?
        BLEQ    Neil_NewLine
        BLNE    Neil_WriteC             ; NOTE(review): relies on Neil_NewLine returning with the
                                        ; flags from the CMP intact, otherwise this is also called
                                        ; after a newline - confirm against Neil_NewLine.
        ADRL    R0,Neil_FileHandle
        LDR     R0,[R0]
        ; return the file handle as a result of this function:
        ; allows the C to know whether output is turned on,
        ; <0 return -> no tracing right now.
        Pull    "R12,PC"
        ]

; Entered by compiled code on a division by zero: logs (when debugging)
; and then drops into 'exit'.
dividebyzero
        Debug   cc,"DIVIDE BY ZERO",a2,a1,lr
        ; FALL THROUGH to exit.

; --------------------------------------------------------------------------------------
; For an unexpected exit from compiled code, such as an assertion failure.
; There's no error message unless you're debugging, you simply stop the
; whole plot operation and return as fast as possible.
; If debugging, you have (presumably) already put out an error message.
;
; exit   - stores 0 in ccompiler_errptr, then behaves as exit_c.
; exit_c - Entry: r1 = value to store in ccompiler_errptr
;                 R0 = optional reason code (only shown by the Debug line)
;                 SL = assembler workspace pointer
;          Reloads SP from ccompiler_sp and branches to exitbiggie; does not return.
exit
        MOV     r1, #0
exit_c
        MOV     R12,SL                  ; get back workspace pointer.
        ADRL    R2,ccompiler_errptr
        STR     r1,[r2]
        ADRL    R1,ccompiler_sp
        LDR     SP,[R1]                 ; get back SP
        Debug   cc,"unexpected exit c code",R0 ; a reason code is possible - usually C line number.
        B       exitbiggie

; ---------------------------------------------------------------------------------------
; Memory allocation for the C code.
; In all three routines lr is copied to r1 before the SWI and the return is
; made through r1 (presumably because a SWI issued in SVC mode corrupts lr).

; malloc - r0 = number of bytes to allocate (see PRM3 1-233)
; on exit, r0 = pointer to block, or 0 for 'none possible'.
; Corrupts r1-r3. On error the error pointer returned by the SWI is stored
; in ccompiler_errptr.
malloc
        MOV     r1,lr                   ; keep return link
        MOV     r3,r0                   ; required size, for RMA allocation
        MOV     r0,#ModHandReason_Claim
        SWI     XOS_Module              ; do the claim - on exit R2 points to block, if non-error
from_realloc
; Shared tail for malloc and realloc_delta: expects V and r0/r2 as left by
; XOS_Module and the return address in r1.
; NOTE(review): exit_c reloads R12 from SL before using ccompiler_errptr; if that
; symbol is addressed relative to R12, R12 may not be the workspace pointer when
; this is reached from compiled C - confirm.
        ADRVSL  r2,ccompiler_errptr
        STRVS   r0,[r2]                 ; if an error, remember it
        MOVVS   r0,#0                   ; ... and return null pointer
        MOVVC   r0,r2                   ; if no error, return pointer to C code
        MOV     pc,r1

; free - r0 = pointer to previously malloc'd block. (see PRM3 1-234)
; Corrupts r0-r2; nothing is stored in ccompiler_errptr.
; NOTE(review): this uses the non-X form OS_Module, unlike malloc and
; realloc_delta, so a failure is not reported back through V - confirm intended.
free
        MOV     r1,lr                   ; keep return link
        MOV     r2,r0                   ; pointer to heap block
        MOV     r0,#ModHandReason_Free
        SWI     OS_Module               ; no error expected, unless we have internal errors
        MOV     pc,r1

; realloc_delta - r0 = pointer to block, r1 = CHANGE in size (see PRM3 1-240)
; (Can't provide the real realloc sadly, because these are the args that the OS wants)
; on exit, r0 = pointer to block, or 0 on error (via from_realloc).
realloc_delta
        MOV     r2,r0                   ; pointer to heap block
        MOV     r3,r1                   ; change in size
        MOV     r0,#ModHandReason_ExtendBlock
        MOV     r1,lr                   ; keep return link
        SWI     XOS_Module
        B       from_realloc

; ---------------------------------------------------------------------------------------
;; debug output from within the compiled code.
;traceentry1
;        Debug   cc,"Tracepoint: lr,r4=",lr,r4
;        MOV     pc,lr
;; accessed from compiled code as bitblockmove-4 - sorry!
;traceentry
;        B       traceentry1

; ---------------------------------------------------------------------------------------
bitblockmove
; routine for simple bit block move.
; This is called by the compiled code when pixels are equal size, no mask, only 'set' gcol action, no table.
; There's no point in 'compiling' it because there are no important variants that we want to compile in,
; so it's clearer to just write it in the assembler.
;
; Summary: copies r_xcount bits from the bit stream starting r_inshift bits
; below the top of the word at r_inptr to the bit stream starting r_outshift
; bits below the top of the word at r_outptr. Bits of the first and last
; output words that lie outside the destination span are read back and kept.

; Registers on entry:
r_inptr         RN      0       ; r_inptr -> input (word pointer)
r_outptr        RN      1       ; r_outptr -> output (word pointer)
r_inshift       RN      2       ; r_inshift (aka r_inword3) - number of (most significant) bits of first word to transfer, in 1..32
r_outshift      RN      3       ; r_outshift (aka r_inword4) - number of (most significant) bits of space in first word to fill up, in 1..32
r_xcount        RN      4       ; r_xcount - number of bits to transfer.
; Workspace registers:
r_inword        RN      5       ; r_inword, r_inword2 - temp space
r_inword2       RN      6       ; must come AFTER r_inword for LDM
r_outword       RN      7       ; r_outword, r_outword2, r_outword3, r_outword4 - temp space
r_outword2      RN      8       ; must come AFTER r_outword for STM
r_shl           RN      9       ; r_shl, r_shr - temp space
r_shr           RN      10
; NB r12 is NOT set up.
; On exit arg registers are corrupted, others preserved

        STMDB   sp!,{r_inword,r_inword2,r_outword,r_outword2,r_shl,r_shr,lr}

;       Debug   cc,"bitblockmove: ",r0,r1,r2,r3,r4
;       Debug   cc," : ",r5,r6,r7,r8,r9
;       Debug   cc," : ",r10,r11,r12,sp,lr,pc

; The following diagrams help when trying to think about shift cases, especially for start conditions etc.
; Note particularly that an entire display line is 'little-endian' - least sig pixel is at the left,
; most sig pixel is at the right in an entirely consistent way.

; Input words:
;   0                             31 0                             31 0                             31   bit number
;   |------------------------------| |------------------------------| |------------------------------|
; What to transfer:
;                            |***************************************************** . . .
;                            <-----> this is r_inshift on entry

; Output words:
;   0                             31 0                             31 0                             31   bit number
;   |------------------------------| |------------------------------| |------------------------------|
; What to fill up:
;                   |***************************************************** . . .
;                   <--------------> this is r_outshift on entry

; The difference between r_outshift and r_inshift is the distance that bulk data has to be shifted,
; once we get into the main loop.

        ; the bottom 32-outshift bits of outword should be loaded
        ; with whatever is there already.
        LDR     r_outword,[r_outptr]
        MOV     r_outword,r_outword,LSL r_outshift
        MOV     r_outword,r_outword,LSR r_outshift      ; discard unwanted bits

        ; xcount counts the number of bits which must be
        ; saved at r_outptr, of which the first r_shl bits can be found in outword
        ; and the remainder are still to be fetched from r_inptr.
        ; (From here on xcount is measured from bit 0 of the first output word.)
        RSB     r_outword2,r_outshift,#32               ; temp use of r_outword2
        ADD     r_xcount,r_xcount,r_outword2            ; add the bits we've just loaded in

        ; Only the top r_inshift bits of r_inword are interesting
        LDR     r_inword,[r_inptr],#4
        RSB     r_inword2,r_inshift,#32                 ; temp use of r_inword2
        MOV     r_inword,r_inword,LSR r_inword2         ; discard unwanted bits
        MOV     r_inword,r_inword,LSL r_inword2

        ; differing code depending on which of r_inshift and r_outshift is bigger
        CMP     r_outshift,r_inshift
        BEQ     insh_equal
        BLT     insh_more

        ; r_outshift is bigger than r_inshift:
        ; the first output word will consist of:
        ;   bottom 32-outshift bits undisturbed
        ;   top inshift bits from first input word
        ;   bottom outshift-inshift (= r_shr bits, shifted left by r_shl) bits from the next input word
        SUB     r_shr,r_outshift,r_inshift
        RSB     r_shl,r_shr,#32
        ORR     r_outword,r_outword,r_inword,LSR r_shr
        ; bottom r_shl bits of r_outword are now loaded with input.
        ; fetch the top of the next word as part of the main loop.
        B       loop64_enter

insh_equal
        ; No shift offset between input and output - everything a lot simpler!
        ; the first output word consists of:
        ;   bottom 32-outshift bits undisturbed
        ;   top 32-outshift bits of the input
        ORR     r_outword,r_outword,r_inword
        CMP     r_xcount,#32
        BLT     less32                                  ; span ends inside the first output word
        STR     r_outword,[r_outptr],#4
        SUBS    r_xcount,r_xcount,#32
        BEQ     done

        ; Now extra-simple 64-bit loop for no-shift case.
        B       loop64_noshift_enter
loop64_noshift
        LDMIA   r_inptr!,{r_inword,r_inword2}
        STMIA   r_outptr!,{r_inword,r_inword2}
loop64_noshift_enter
        SUBS    r_xcount,r_xcount,#64
        BGE     loop64_noshift
        ; Set up the state loop64_exit expects: nothing held over in r_outword,
        ; and a zero shift (LSL by 0 passes the word through, LSR by 32 gives 0).
        MOV     r_outword,#0
        MOV     r_shl,#0
        MOV     r_shr,#32
        B       loop64_exit

insh_more
        ; inshift is bigger than outshift
        ; the first output word will consist of:
        ;   bottom 32-outshift bits undisturbed
        ;   outshift bits from the middle of the input word
        SUB     r_shl,r_inshift,r_outshift
        RSB     r_shr,r_shl,#32
        ORR     r_outword,r_outword,r_inword,LSL r_shl
        ; We still have r_shl bits of input at the top of r_inword,
        ; not the correct situation for entering the 64-bit loop (they should be
        ; at the bottom of r_outword). So, have to do one word of output by steam.
        CMP     r_xcount,#32
        BLT     less32
        STR     r_outword,[r_outptr],#4
        MOV     r_outword,r_inword,LSR r_shr            ; carry the top r_shl input bits into the next word
        SUBS    r_xcount,r_xcount,#32
        BEQ     done
        ; and fall into the 64-bit loop.

        ; the 64-bit loop - main time-critical bit
        ; The bottom r_shl bits of r_outword are valid and must be saved at r_outptr.
        SUBS    r_xcount,r_xcount,#64
        BLT     loop64_exit
loop64
        LDMIA   r_inptr!,{r_inword,r_inword2}                   ; pick up 64 input bits
        ORR     r_outword,r_outword,r_inword,LSL r_shl          ; borrow r_shl bits already in r_outword
        MOV     r_outword2,r_inword,LSR r_shr                   ; create outword2
        ORR     r_outword2,r_outword2,r_inword2,LSL r_shl
        STMIA   r_outptr!,{r_outword,r_outword2}                ; output 64 bits
        MOV     r_outword,r_inword2,LSR r_shr                   ; holding r_shl bits over in r_outword
loop64_enter
        SUBS    r_xcount,r_xcount,#64                           ; loop if at least 64 bits still to do
        BGE     loop64                                          ; loop unless finished
; Two labels mark this address; only loop64_exit is branched to in this section.
finished
loop64_exit
        ; we have finished the 64-bit loop
        ADDS    r_xcount,r_xcount,#64                           ; count how many still to do
        BEQ     done                                            ; exit if exactly finished

        ; The bottom r_shl bits of r_outword are valid and must be saved at r_outptr.
        ; r_xcount is less than 64.
        ; NOTE(review): two input words are loaded here unconditionally, whatever
        ; the number of bits still required.
        LDMIA   r_inptr!,{r_inword,r_inword2}                   ; all the input we'll ever need
        ORR     r_outword,r_outword,r_inword,LSL r_shl          ; make r_outword valid
        CMP     r_xcount,#32                                    ; flags from this CMP drive the next five instructions
        STRGE   r_outword,[r_outptr],#4                         ; if xcount >= 32 then do a whole word
        SUBGE   r_xcount,r_xcount,#32
        BEQ     done                                            ; if exactly 32 bits were left
        MOVGT   r_outword,r_inword,LSR r_shr                    ; create last output word
        ORRGT   r_outword,r_outword,r_inword2,LSL r_shl

less32
        ; output the bottom xcount (in 1..31) bits of r_outword, combined with what is already at [r_outptr].
        ; r_shl is reused as a temporary from here on.
        LDR     r_outword2,[r_outptr]                           ; load word already there - we want top 32-xcount bits
        MOV     r_outword2,r_outword2,LSR r_xcount              ; get rid of unwanted bits
        MOV     r_outword2,r_outword2,LSL r_xcount
        RSB     r_shl,r_xcount,#32
        MOV     r_outword,r_outword,LSL r_shl                   ; get rid of any unwanted new bits
        ORR     r_outword,r_outword2,r_outword,LSR r_shl        ; and combine the two
        STR     r_outword,[r_outptr]                            ; then save - we've finished

done
        LDMIA   sp!,{r_inword,r_inword2,r_outword,r_outword2,r_shl,r_shr,pc}

; --------------------------------------------------------------------------
; Now the entry sequence from the main assembler.
; We B here from the assembler when various calculations
; have already been done, and various values in the assembler workspace
; set up.
; To exit from here we B exitbiggie - r12,r13 must be preserved.
; Entry: r1 = the sprite itself
;        r5 = GCOL action, and whether to use mask.
;        r12 = assembler workspace pointer (of course)
;        r13 = SVC stack (of course)
new_putscaled_compiler

; --------------------------------------------------------------------------------------
; Before entering the C code, one more possible optimisation: we check for
; a lookup table that has no effect at all, and if found remove it.
; You'd be surprised how often such lookup tables get passed in :-)
; They slow down the blitting code, significantly in some cases.
; Identity-table check (body of new_putscaled_compiler).
; The table is only examined when one is present (ColourTTR non-zero), the
; output bpp equals the input bpp, and that bpp is below 16. If every byte of
; the table equals its own index, ColourTTR is zeroed so that no table is used.
        LDR     r0,ColourTTR            ; get the table
        CMP     r0,#0                   ; if no table
        BEQ     t_exit                  ; then skip this bit
        LDR     r2,BPP                  ; get output bpp
        LDR     r3,save_inlog2bpp       ; get log2 of input bpp
        MOV     r4,#1
        MOV     r4,r4,LSL r3            ; get input bpp
        CMP     r2,r4                   ; output bpp = input bpp?
        BNE     t_exit                  ; if not, don't even try
        CMP     r4,#16                  ; is input bpp 16 or more?
        BGE     t_exit                  ; if so, don't even try
        MOV     r3,#1
        MOV     r3,r3,LSL r4            ; get size of table, in bytes (2,4,16 or 256)
        ; If we reach here it's definitely worth looking through the table.
        ; r0 = table
        ; r3 = size of table
        ; r1,r5 to be preserved
        ; all others are trash.
        MOV     r2,#0                   ; r2 = expected next value in table
        TST     r0,#3                   ; is the table word-aligned?
        TSTEQ   r3,#3                   ; is the table more than 2 bytes?
        BEQ     t_wordaligned           ; if so, skip first loop that does first 1..3 bytes

        ; table pointer not aligned, or table of just 2 entries
t_loop0
        LDRB    r4,[r0],#1              ; r4 = next value fetched from table
        CMP     r4,r2                   ; should equal expected value
        BNE     t_fail                  ; if not, give up
        ADD     r2,r2,#1                ; increment expected value
        SUBS    r3,r3,#1                ; decrement remaining size of table
        BEQ     t_identity              ; table of just 2 bytes
        TST     r0,#3                   ; are we word aligned yet?
        BNE     t_loop0                 ; loop until word aligned
        ; exit from first-three-bytes loop - table is now word aligned.

        ; The main loop does four table entries at a time.
        ORR     r2,r2,r2,LSL #8         ; construct four copies of current value of table
        ORR     r2,r2,r2,LSL #16
t_wordaligned                           ; we branched to here with r2=0 if already word aligned
        LDR     r6,c03020100
        ADD     r2,r2,r6                ; r2 = next four values in table (next expected value in the low byte)
        LDR     r6,c04040404            ; r6 = what to add to get next four values
        SUBS    r3,r3,#4                ; is table size at least four?
        BLT     t_loop_exit             ; if not, it was 2 or 4 to start with and not word-aligned
t_loop                                  ; start of 4-at-a-time loop
        LDR     r4,[r0],#4              ; r4 = next 4 values fetched from table
        CMP     r4,r2                   ; compare four values
        BNE     t_fail                  ; fail if any one not identical
        ADD     r2,r2,r6                ; advance all four bytes
        SUBS    r3,r3,#4                ; counter of remaining table size
        BGE     t_loop
t_loop_exit                             ; exit from 4-at-a-time loop
        ADDS    r3,r3,#4                ; remaining table size
        BEQ     t_identity              ; succeed - normal route for word-aligned table
        ; otherwise, 1..3 bytes left to check as the tail of the table is not word-aligned
        ; r2 holds the next four expected values, lowest table address in the
        ; low byte, so the next expected single value is the low byte of r2.
        ; (Taking the top byte plus one gives next+4, which rejected every
        ; identity table that reached this tail loop.)
        AND     r2,r2,#&FF              ; next expected table value
t_loop2
        LDRB    r4,[r0],#1              ; next value from table
        CMP     r4,r2                   ; table value = expected value?
        BNE     t_fail                  ; if not, fail
        ADD     r2,r2,#1                ; increment expected value
        SUBS    r3,r3,#1                ; decrement size
        BNE     t_loop2                 ; branch until 0
        ; exit from last-three-bytes loop

t_identity                              ; success! discard the table
        STR     r3,ColourTTR            ; r3 known to be 0, as we've tested the whole table

t_exit
t_fail                                  ; a normal table, or no table - do nothing.

; --------------------------------------------------------------------------------------
; Now the entry to the C code.
; Entry: r1 = the sprite itself
;        r5 = GCOL action, and whether to use mask.
;        r12 = assembler workspace pointer (of course)
;        r13 = SVC stack (of course)
;        all others are trash.
; Arguments handed to putscaled_compiler:
;        R0 = assembler workspace pointer
;        R1 = address of ccompiler_space, R2 = address of ccompiler_end
;        R3 = GCOL action and mask bit
; SP is saved in ccompiler_sp (reloaded by exit_c) and ccompiler_errptr is zeroed first.
        LDR     r1,[r1,#spMode]         ; get the mode number/identifier
        STR     r1,save_mode            ; can be picked up by the C from here.

        MOV     R0,R12                  ; assembler workspace pointer
        ADRL    R1,ccompiler_space      ; above R12, space for me.
        ADRL    R2,ccompiler_end        ; end of space for me.
        MOV     SL,R12                  ; will be left alone by compiled C - for debug routines above.
        ADRL    R3,ccompiler_sp
        STR     SP,[R3]                 ; in case of unexpected exit
        MOV     R3,R5                   ; GCOL action and mask bit
        Debug   gs,"R1,R5,R12 = ",R1,R5,R12
        MOV     R4,#0
        ADRL    R5,ccompiler_errptr
        STR     R4,[R5]                 ; in case of error exit
        Debug   cc,"entering c code"
        BL      putscaled_compiler      ; dive into the C
                                        ; returns r0==compiled code.
        MOV     R12,SL                  ; R12 is ip to a C prog, will have been trampled on - restore it.

;; We can't try the next bit until I can actually compile a routine:
;; returning a literal C routine doesn't work, AAsm can't handle what CC puts out.
;; So, do nothing for now.
;       Debug   cc,"exit c code",R0
;       B       exitbiggie

; If debugging, and if Debug$File is set, then do not enter the code.
; Only if Debug$File is unset do we enter the code.
        SWI     XOS_RemoveCursors       ; about to stomp on the screen
        BVS     exitbiggie
        ADR     LR,ret                  ; set return address
        MOV     PC,R0                   ; and branch to the compiled code
        NOP
ret
        NOP
        SWI     XOS_RestoreCursors
        B       exitbiggie

; constants required for table comparison
c03020100       DCD     &03020100       ; byte offsets 0,1,2,3 for four consecutive table entries
c04040404       DCD     &04040404       ; per-word increment applied to all four byte lanes

        [ jpeg
; ----------------------------------------------------------------------
jpeg_fetchroutine
; This is called every line by compiled code from PutSpriteScaled, when the
; source is JPEG compressed data.
; Entry:
;   r0 = y coordinate
;   r12 = wp
; Exit:
;   r0 = initial address to get RGB data from for this line, based given y coord.
;   all other registers preserved.
; This works by calling the C proc jpeg_find_line, defined in h.rojpeg, as:
;   static int *jpeg_find_line(decompress_info_ptr cinfo, int ycoord, int *palette_data);
        Push    "R1-R3,R10-R12,LR"      ; r4-r9 are preserved by C code.
        MOV     R1,R0                   ; arg2 - int y coord
        ADRL    R0,jpeg_info_ptr
        LDR     R0,[R0]                 ; arg1 - decompress_info_ptr cinfo
        ADRL    R2,newtranstable        ; arg3 - address of newtranstable
        Debug   gs,"palette in R2 = ",R2
; get ready to call C code
        MOV     SL,R12                  ; will be left alone by compiled C - for debug routines above.
; and call
        BL      jpeg_find_line          ; base of that line in R0 on return
; This bit now removed, the calling code adds the in_x offset. This could be
; a word, byte or half-word offset, depending on how jpeg_scan_file was called.
;       MOV     R12,SL                  ; R12 is ip to a C prog, will have been trampled on - restore it.
;       ; returned value is int* for base of line.
;       ; now add in the initial source X coordinate.
;       LDR     R1,in_x                 ; x offset, as word count
;       ;Debug  cc,"x offset to add",R1
;       ADD     R0,R0,R1,LSL #2         ; add word offset
        Pull    "R1-R3,R10-R12,PC"      ; restore registers and return to compiled code.
        ]

; Routine to Create a dynamic area with an Internationalised name.
; Called from C because the C code does not know about the Messages file.
; Entry: r0 = value passed to OS_DynamicArea in r2
;             (presumably the initial size of the area - confirm against the C caller)
; Exit:  r0 = value returned in r1 by OS_DynamicArea, or 0 if the SWI returned an error
;        r1-r9 preserved
create_dynamic_area
        Push    "R1-R9,LR"
        MOV     r2, r0                  ; keep the caller's argument for the SWI
        ADRL    r0, ErrorBlock_DynName
        BL      copy_error_one          ; NOTE(review): assumed to look up the DynName token and
                                        ; return a block pointer in r0 with r2 intact - confirm.
        CLRV                            ; V is cleared after the lookup, before the SWI
        ADD     r8, r0, #4              ; r8 now points to Dynamic Area Name string!
        MOV     r0, #0
        MOV     r1, #-1
        MOV     r3, #-1
        MOV     r4, #0
        MOV     r5, #&600000
        MOV     r6, #0
        MOV     r7, #0
        SWI     XOS_DynamicArea
        MOVVS   r0, #0                  ; error: return 0
        MOVVC   r0, r1                  ; success: return the r1 result
        Pull    "R1-R9,PC"

; Block passed to copy_error_one: one word followed by a zero-terminated string.
ErrorBlock_DynName
        DCD     1
        DCB     "DynName:JPEG Workspace"
        DCB     0
        ALIGN

; ----------------------------------------------------------------------------------------------

        GET     putscaled.s

; Now get optimised assembler bits of JPEG
        GBLL    cfsi_jpeg               ; for inclusion in ChangeFSI binary?
cfsi_jpeg SETL  {FALSE}

        GET     sources.jdhuff
        GET     sources.jrevdct
        GET     yuvtabs.s
        GET     sources.jcconv
        GET     sources.diffuse
        GET     sources.swiv2

        END