; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
; > Sources.SprTrans

 ^ 0,SP
trns_spr_xcoords                #       16      ;       Four x coordinates
trns_spr_ycoords                #       16      ;       Four y coordinates
trns_comp_spr_left              #       4       ;       Sprite left hand edge (bottom 16 bits)
trns_comp_spr_start             #       4       ;       Sprite start (accounting for internal coord block top)
trns_comp_spr_byte_width        #       4       ;       Sprite byte width << (3-sprite bpp)
trns_comp_spr_height            #       4       ;       Sprite height (top 16 bits) and right hand edge (bottom 16)
trns_comp_spr_ttr               #       4       ;       Translation table (if required)
trns_comp_spr_masko             #       4       ;       Sprite mask offset from image << (3-sprite bpp)
trns_comp_ecf_ora               #       4       ;       ECF OR word
trns_comp_ecf_eor               #       4       ;       ECF EOR word
trns_codebuffer                 #       4       ;       Pointer to codebuffer
trns_spr_X_x0_y                 #       4       ;       Sprite X,Y at top coordinate of area
trns_spr_Y_x0_y                 #       4       ;            in 16.16 fixed point
trns_spr_inc_X_x                #       4       ;       Sprite increments
trns_spr_inc_Y_x                #       4       ;          ( change induced by single
trns_spr_inc_Y_y                #       4       ;            increments in screen x,y on
trns_spr_inc_X_y                #       4       ;            sprite X,Y )
trns_spr_lineptr                #       4       ;       Line to output onto
trns_spr_edgeblock              #       6*4*4   ;       Edge blocks, in format as below
trns_spr_edgeblock_end          #       4*6     ;        -1, to denote end of edge block
trns_ecf_ptr                    #       4       ;       Ecf pointer
trns_masking_word               #       4       ;       Masking word for > eight bit per pixel
trns_comp_mask_offset           #       4       ;       used to point at 1bpp mask data
trns_comp_spr_mask_width        #       4       ;       1bpp mask equivalent of spr_width
trns_comp_mask_base             #       4       ;       1bpp mask adjustment to mask data
trns_spr_vars_end               *       :INDEX:@

; The edge blocks are stored in a similar fashion to Draw Quick Fill, and the code is mainly a copy
; of that. The actual layout of the blocks is as follows:
;   Offset 0  : Flag word. Top two bits specify direction of line in the X-axis (01=positive,1x=negative)
;                          Bottom two bits specify whether edge is active (01), not yet active (00), or dead (1x, i.e. the trns_deactivated bit is set)
;      The next 5 words are dependent on the flags. For active edges:
;   Offset 4  : Lower Y coordinate (in 256ths pixels, bottom of screen is 0) of edge. The line is deactivated
;                 after reaching this Y coordinate
;   Offset 8  : ABS(deltaX) for the line, in 256ths of a pixel
;   Offset 12 : ABS(deltaY) for the line, in 256ths of a pixel
;   Offset 16 : Bresenham error value for the pixel
;   Offset 20 : Current X coordinate of the point (in pixels)
;       For inactive edges:
;   Offset 4  : Lower X coordinate (256ths of pixel)
;   Offset 8  : Lower Y coordinate
;   Offset 12 : Upper X coordinate
;   Offset 16 : Upper Y coordinate
trns_activated          *       2_0001
trns_deactivated        *       2_0010


trns_xsize              RN      0 ; Top 16 bits only.           Used throughout the compiled loop
trns_spr_left           RN      0 ; Bottom 16 bits only.        Used throughout the compiled loop
trns_scr_lx             RN      0 ;                             Used outside the loop
trns_spr_start          RN      1 ;                             Used throughout the loop
trns_scr_rx             RN      1 ;                             Used outside the loop
trns_offset             RN      2 ;                             Used throughout the loop
trns_scr_y              RN      2 ;                             Used outside the loop
trns_X                  RN      3 ;                             Used throughout the loop
trns_X_x0_y             RN      3 ;                             Used outside the loop
trns_Y                  RN      4 ;                             Used throughout the loop
trns_Y_x0_y             RN      4 ;                             Used outside the loop
trns_inc_X_x            RN      5 ;                             Used throughout the loop & outside the loop
trns_inc_Y_x            RN      6 ;                             Used throughout the loop & outside the loop
trns_byte_width         RN      7 ;                             Used throughout the loop
trns_inc_X_y            RN      7 ;                             Used outside the loop
trns_spr_height         RN      8 ; Top 16 bits only.           Used throughout the compiled loop
trns_spr_right          RN      8 ; Bottom 16 bits only.        Used throughout the compiled loop
trns_inc_Y_y            RN      8 ;                             Used outside the loop
trns_out_ptr            RN      9 ;                             Used throughout the loop
trns_line_ptr           RN      9 ;                             Used outside the loop
trns_out_word           RN      10;                             Used throughout the loop
trns_vertex_ptrs        RN      10;                             Used outside the loop
trns_out_mask           RN      11;                             Used throughout the loop
trns_dummy11            RN      11;                              (dummy register - used outside the loop)
trns_out_x              RN      12;                             Used on entry to the compiled loop
trns_workspace_ptr      RN      12;                             Workspace pointer
trns_dummy12            RN      12;                              (dummy register - used everywhere)
trns_in_pixel           RN      14;                             Used in the middle to end of loop
trns_dummy14            RN      14;                              (dummy register - used everywhere)

        GBLA    ldmreg
        GBLA    ldmreg3
        GBLS    ldmreg2

        ; TrnsAsm label,size,cc
        ; Assembles code which copies "size" bytes (a whole number of words) of
        ; template code starting at "label" to the output pointer held in R10,
        ; advancing R10 past the words written. "cc" is an optional condition code
        ; applied to every instruction generated.
        ; Corrupts R4, plus R5 upwards (at most R9) depending on the size.
        ; ldmreg2 is the last digit of (word count + 3), i.e. the number of the
        ; highest register needed when loading from R4 upwards. That trick only
        ; works for up to 6 words (R4-R9), so larger blocks are copied 6 words at
        ; a time with the macro invoking itself for the remainder.
        MACRO
$l      TrnsAsm         $label,$size,$cc
ldmreg  SETA    $size:SHR:2
ldmreg3 SETA    ldmreg+3
ldmreg2 SETS    "$ldmreg3":RIGHT:1
$l
    [ ldmreg>6
        ADR$cc  R4,$label
        ; No base writeback on this load: R4 is in the register list, and LDM
        ; with writeback and the base in the list leaves the base register
        ; unpredictable - yet R4 is the first word stored by the STM below.
        ; The recursive invocation re-derives the source address itself.
        LDM$cc.IA R4,{R4-R9}
        STM$cc.IA R10!,{R4-R9}
        TrnsAsm ($label+6*4),($size-6*4),$cc
    |
      [ ldmreg>1
        ADR$cc.L  R4,$label
        LDM$cc.IA R4,{R4-R$ldmreg2}
        STM$cc.IA R10!,{R4-R$ldmreg2}
      |
        LDR$cc  R4,$label
        STR$cc  R4, [R10], #4
      ]
    ]
        MEND


        ; TrnsAsmReg reg,size,cc
        ; As TrnsAsm, but the source address is already in register "reg": loads
        ; size/4 words from [reg] into R4 upwards and stores them at R10, advancing
        ; R10. "reg" itself is not written back. "cc" is an optional condition code.
        ; The register list is built from the last digit of (word count + 3), so
        ; this presumably expects between 1 and 6 words - TODO confirm at call sites.
        MACRO
$l      TrnsAsmReg      $reg,$size,$cc
ldmreg  SETA    $size:SHR:2
ldmreg3 SETA    ldmreg+3
ldmreg2 SETS    "$ldmreg3":RIGHT:1
$l      LDM$cc.IA $reg,{R4-R$ldmreg2}
        STM$cc.IA R10!,{R4-R$ldmreg2}
        MEND


        ; TrnsBranch reg,op
        ; On entry "reg" holds a target address and R10 the address at which the
        ; next generated instruction will be stored. Builds a PC-relative branch
        ; word: (target - R10 - 8) with bits 2-25 moved down to bits 0-23, ORed
        ; with the constant "op" (expected to supply the top byte of the
        ; instruction), and stores it at [R10], advancing R10 by 4.
        ; Corrupts "reg".
        ; NOTE(review): the label parameter of the prototype is never placed in
        ; the body, so a label written on an invocation would be dropped.
        MACRO
$l      TrnsBranch      $reg,$op
        SUB     $reg,$reg,R10
        SUB     $reg,$reg,#8
        MOV     $reg,$reg,LSL#6
        MOV     $reg,$reg,LSR#8
        ORR     $reg,$reg,#$op
        STR     $reg,[R10],#4
        MEND

        ; DivRem2 rc,ra,rb,rtemp
        ; Unsigned shift-and-subtract division:
        ;   rc := ra DIV rb,  ra := ra REM rb,  rb preserved,  rtemp corrupted.
        ; When debugtr is set, a zero divisor triggers OS_BreakPt first.
        ; Uses local labels 01 and 02.
        MACRO
$label  DivRem2 $rc, $ra, $rb, $rtemp
$label
        [       debugtr
        TEQ     $rb,#0
        SWIEQ   OS_BreakPt
        ]
        MOV     $rtemp, $rb
        CMP     $rtemp, $ra, LSR #1
01
        ; Double the trial divisor while it is still no more than half the dividend
        MOVLS   $rtemp, $rtemp, LSL #1
        CMPLS   $rtemp, $ra, LSR #1
        BLS     %BT01
        MOV     $rc, #0
02
        ; Subtract where possible; the carry from the compare is the next quotient bit
        CMP     $ra, $rtemp
        SUBCS   $ra, $ra, $rtemp
        ADC     $rc, $rc, $rc
        MOV     $rtemp, $rtemp, LSR #1
        CMP     $rtemp, $rb
        BCS     %BT02
        MEND

 [ usemull
        ! 0, "Using SMULL - StrongARM or later only"
 ]

        ; SSmultD ra,rb,rl,rh
        ; Signed 32x32 -> 64 bit multiply of ra by rb, result low word in rl and
        ; high word in rh (rh must be the register after rl; see the ASSERTs).
        ; With usemull set a SMULL is assembled inline; if ra is also one of the
        ; result registers it is first copied to R14. Otherwise arith_SSmultD is
        ; called with the register numbers held in the word following the BL
        ; (presumably picked up by that routine via the return address - confirm
        ; against arith_SSmultD). R14 may therefore be corrupted.
        MACRO
        SSmultD $ra,$rb,$rl,$rh
        ; Asserts to check requirements always meet both options
        ASSERT  $rh = $rl + 1
        ASSERT  $ra <= R8
        ASSERT  $rb <= R8
        ASSERT  $rl <= R7
 [ usemull
    [ $ra = $rl :LOR: $ra = $rh
        ! 0, "Register clash avoided in SSmultD"
        MOV     R14, $ra
        SMULL   $rl,$rh,R14,$rb
    |
        SMULL   $rl,$rh,$ra,$rb
    ]
 |
        BL      arith_SSmultD
        DCB     $ra,$rb,$rl,0
 ]
        MEND


        ; Error blocks used by the transformed plotting code, followed by the
        ; common error exits. Each exit points R0 at its error block and r1 at
        ; Title, calls copy_error_one (which returns with V set) and then leaves
        ; through exitbiggie. They are branched to only while the stack holds just
        ; the registers pushed on entry: callers that have allocated workspace on
        ; the stack remove it before branching here.
        MakeSpriteErrorBlock BadFlags,,BadFlgs
        MakeSpriteErrorBlock BadCoordBlock,,BadCBlk
        MakeSpriteErrorBlock BadSourceRectangle,,BadRect
        MakeSpriteErrorBlock BadTransformation,,BadTfrm

; Source rectangle lies outside the sprite
badcoordblock
        ADR     R0, ErrorBlock_BadCoordBlock
        addr    r1, Title
        BL      copy_error_one                  ; Always sets the V bit
        B       exitbiggie

; Source rectangle has zero width or zero height
nullareasource
        ADR     R0, ErrorBlock_BadSourceRectangle
        addr    r1, Title
        BL      copy_error_one                  ; Always sets the V bit
        B       exitbiggie

; Destination coordinates do not form a parallelogram
badtransformation
        ADR     R0, ErrorBlock_BadTransformation
        addr    r1, Title
        BL      copy_error_one                  ; Always sets the V bit
        B       exitbiggie

        ; Bits of the flags word passed in R3 (any other bit set is faulted):
        ;   flg_matrix     - when SET, R6 is taken as pointing at the destination
        ;                    coordinates (the code branches to coords_passed_in);
        ;                    when clear, R6 is handed to Draw_TransformPath as the
        ;                    matrix. NB despite the name, the bit being set means
        ;                    that a matrix is NOT in use.
        ;   flg_coordblock - when set, R4 points at a four word source rectangle;
        ;                    when clear the whole sprite is used.
        GBLA    flg_matrix
        GBLA    flg_coordblock
flg_matrix      SETA    1
flg_coordblock  SETA    2

; Go_PlotMaskTransformed
; Entry point for plotting a sprite's mask transformed. Stacks R1-R9,LR exactly
; as Go_PutSpriteTransformed does, forces R5 (GCOL action) to 8 and R7
; (translation table pointer) to 0, then joins the common code at local label 01
; inside Go_PutSpriteTransformed.
Go_PlotMaskTransformed
        Push    "R1-R9,LR"
; Pretend it's a GCOL 8 - use sprite's mask if it has one
        MOV     R5,#8
; No colour translation
        MOV     R7,#0
        B       %FT01

Go_PutSpriteTransformed
        Push    "R1-R9,LR"
01
        Debug   tr,"Draw sprite: R0,R1,R2 =",R0,R1,R2
        Debug   tr,"Coords, gcol, &scale, &ttr =",R3,R4,R5,R6,R7

        CLRPSR  I_bit, R14              ; re-enable interrupts

        [       flagbit
        MOV     R14, R5, LSR #4
        STR     R14, trns_flags2
        [ widetrans
        BICS    R14, R14, #flg2_ignorettr+flg2_widetrans + flg2_ditheron
        |
        BICS    R14, R14, #flg2_ignorettr + flg2_ditheron
        ]
        ADRNE   R0, ErrorBlock_BadFlags
        addr    r1, Title, NE
        BLNE    copy_error_one          ; Always sets the V bit
        BVS     exitbiggie
        ]

        AND     R5,R5,#&0F              ; only bottom 4 bits are interesting

        BICS    R14, R3, #flg_matrix:OR:flg_coordblock  ; Check for legal flags
        ADRNE   R0, ErrorBlock_BadFlags
        addr    r1, Title, NE
        BLNE    copy_error_one          ; Always sets the V bit
        BVS     exitbiggie

; see if reason code indicated a sprite name or sprite pointer in R2

        BL      findsprite              ; R2 --> sprite

        MOVVC   R1,R2                   ; now R1 --> sprite

; read input/output mode variables

        BLVC    readvduvars
        LDRVC   R0,[R1,#spMode]         ; get sprite's original mode
        MOVVC   R14,R0,LSR #27

        STRVC   R14,save_spr_type       ; and derive the sprite type from it
        BLVC    readspritevars
        BVS     exitbiggie

        [ ignore_ttr
; check whether it has a palette - if doing <16 to >8 we may use it in preference
; to the translation table

        MOV     R0, #0
        STR     R0, trns_palette

        LDR     R0,[R1,#spImage]
        CMP     R0,#SpriteCBsize
        BEQ     trns_has_no_palette

        LDR     R14,[R1,#spTrans]
        CMP     R14,#SpriteCBsize
        BEQ     trns_has_no_palette

        ;validate it (to exclude 8bpp without full palettes)
        ;test is that palette size should be 8*ncolours

        ;find the lower of the sprite start and mask start
        CMP     R0,R14
        MOVCS   R0,R14
        SUB     R0,R0,#SpriteCBsize

        MOV     R0,R0,LSR #3 ;divide by 8 for number of palette entries
        MOV     R14,#1
        LDR     R9,save_inbpp
        MOV     R14,R14,ASL R9

        CMP     R14,R0
        BNE     trns_has_no_palette

        [       flagbit
        LDR     R14, trns_flags2
        TST     R14, #flg2_ignorettr
;        BNE     %FT01
;        [       med01867
;        CMP     R9, #8
;        BNE     trns_has_no_palette ;restrict it to 8bpp full palette only
;        ]
        BEQ     trns_has_no_palette
01
        |
;        [       med01867
;        CMP     R9, #8
;        BNE     trns_has_no_palette ;restrict it to 8bpp full palette only
;        ]
        B       trns_has_no_palette
        ]

        LDR     R14,[R1,#spTrans]
        ADD     R14,R1,#SpriteCBsize
        STR     R14, trns_palette
trns_has_no_palette
       ]

; validate supplied translation table (if any)

        CMP     R7,R7,ASR #31           ; documented as <=0 being none - now accept 0 or -1
        MOVEQS  R7,#0                   ; 0 ==> no translation
        BLNE    checktrans
        BVS     exitbiggie              ; fault it because it failed
;
        [ ignore_ttr
; if doing a sprite of <16bpp to >8bpp, and it has a palette, change the ttr pointer
; to point at the palette data instead of the ttr data. Note that the two are different
; formats, so there is also a different plotting routine to include too...

        LDR     R14, BPP                               ; output bpp
        CMP     R14, #16
        MOVCC   R14, #0
        STRCC   R14, trns_palette
        BCC     %FT45
        LDR     R14, save_inbpp
        CMP     R14, #16
        MOVCS   R14, #0
        STRCS   R14, trns_palette
        BCS     %FT45
        LDR     R14, trns_palette
        TEQ     R14, #0
        MOVNE   R7,R14
        MOVEQ   R14,#0
        STREQ   R14, trns_palette ;only non-zero if going to use this

; trns_palette doubles as a pointer to the palette up this far, and then becomes a
; compilation flag for the macro generation (with the value being passed in as the
; ttr address)

45
        ]
        STR     R7,ColourTTR
        CMP     R7,#0
        BNE     trns_notrans
;
        LDRB    R14,spritecode          ; R14 = bottom 8 bits of reason code
        TEQ     R14,#SpriteReason_PlotMaskTransformed
        LDRNE   R14,save_inbpp
        LDRNE   R0,BPP
        TEQNE   R14,R0                  ; OK if same bpp or mask plotting

        ;however, don't error if going 16>32 or 32>16
        BEQ     trns_notrans ;dispose of the equal case

        CMP     R0,#16
        CMPEQ   R14,#32
        BEQ     trns_notrans

        CMP     R0,#32
        CMPEQ   R14,#16
        BEQ     trns_notrans

        B       errtrans

trns_notrans
; adapt mode variables intelligently (account for double pixel modes)
; The graphics window x limits are scaled up by (Log2bpc - Log2bpp) bits.
; gwx0 is the left edge, so it is simply shifted. gwx1 is taken to be an
; inclusive right edge (as gwy1 is, which is clamped against directly further
; down), so it is converted via the exclusive edge: ((gwx1+1) << shift) - 1.
; With a shift of zero both limits therefore come out unchanged.

        LDR     R8, Log2bpc
        LDR     R9, Log2bpp
        SUB     R8, R8, R9              ; R8 = extra shift for double pixel modes
        LDR     R0, gwx0
        MOV     R0, R0, ASL R8
        STR     R0, gwx0
        LDR     R0, gwx1
        ADD     R0, R0, #1              ; inclusive -> exclusive
        MOV     R0, R0, ASL R8
        SUB     R0, R0, #1              ; exclusive -> inclusive again
        STR     R0, gwx1

; Valid registers: R1 - ptr to sprite?, R3 - flags, R4 -> coordinate block, R5 GCOL, R6->matrix/coords, R7->ttr
;        Debug   tr,"Read VDU and sprite:",#gwx0,#gwy0,#gwx1,#gwy1,#orgx,#orgy
;        Debug   tr,"Pointer to sprite:",R1
; Get coordinate block

        LDR     R2, save_inlog2bpp
        TST     R3, #flg_coordblock
        BNE     %FT01

; (whole sprite R0 - left, R8 - top y, R9 - right, R14 - bottom)

        LDR     R14, save_spr_type
        CMP     R14, #0
        LDREQ   R0, [R1, #spLBit]
        MOVEQ   R0, R0, ASR R2
        MOVNE   R0, #0                  ; new format sprites don't have any lh wastage
        LDR     R8, [R1, #spHeight]
        ADD     R8, R8, #1
        LDR     R9, [R1, #spRBit]
        RSB     R9, R9, #31
        LDR     R4, [R1, #spWidth]
        ADD     R4, R4, #1
        RSB     R9, R9, R4, LSL#5
        MOV     R9, R9, ASR R2
        MOV     R14, #0
        B       %FT02
01

; Check area passed in

        LDMIA   R4, {R0,R8,R9,R14}

; If using a matrix then clip source rectangle to sprite

        TST     R3, #flg_matrix
        BNE     %FT03
        CMP     R8, #0
        MOVLT   R8, #0
        CMP     R14, #0
        MOVLT   R14, #0
        LDR     R4, [R1, #spHeight]
        ADD     R4, R4, #1
        CMP     R8, R4
        MOVGT   R8, R4
        CMP     R14, R4
        MOVGT   R14, R4
        CMP     R0, #0
        MOVLT   R0, #0
        CMP     R9, #0
        MOVLT   R9, #0

        LDR     R4, [R1, #spLBit]
        ADD     R0, R0, R4, LSR R2
        ADD     R9, R9, R4, LSR R2
        LDR     R4, [R1, #spRBit]
        RSB     R4, R4, #31
        LDR     R10, [R1, #spWidth]
        ADD     R10, R10, #1
        RSB     R4, R4, R10, LSL#5

        CMP     R0, R4, LSR R2
        MOVGT   R0, R4, LSR R2
        CMP     R9, R4, LSR R2
        MOVGT   R9, R4, LSR R2
        CMP     R0, R9
        CMPNE   R8, R14
        BEQ     nullareasource
        B       %FT02
03

; If not using a matrix then check valid source area

        CMP     R0, R9
        CMPNE   R8, R14
        BEQ     nullareasource
        CMP     R8, #0
        CMPGE   R14, #0
        BLT     badcoordblock
        LDR     R4, [R1, #spHeight]
        ADD     R4, R4, #1
        CMP     R8, R4
        CMPLE   R14, R4
        BGT     badcoordblock
        CMP     R0, #0
        CMPGE   R9, #0
        BLT     badcoordblock

        LDR     R4, [R1, #spLBit]
        ADD     R0, R0, R4, LSR R2
        ADD     R9, R9, R4, LSR R2
        LDR     R4, [R1, #spRBit]
        RSB     R4, R4, #31
        LDR     R10, [R1, #spWidth]
        ADD     R10, R10, #1
        RSB     R4, R4, R10, LSL#5

        CMP     R0, R4, LSR R2
        CMPLE   R9, R4, LSR R2
        BGT     badcoordblock
02
        TST     R3, #flg_matrix
        BNE     coords_passed_in

; Valid registers: R0,R8,R9,R14 - source rect, R1 - ptr to sprite, R5 GCOL, R6->matrix/coords, R7->ttr
; A matrix was supplied: turn it into a block of four destination coordinates
; by pushing three corners of the source rectangle through Draw_TransformPath.
; Set up input path for draw - move to R0,R8, move to R9,R8, move to R0,R14, end path
;
; Stack while this section runs (lowest address first):
;   [sp]          saved R12 (workspace pointer)
;   [sp+4]        64 byte buffer holding the path (10 words are used)
;   [sp+68]       R0,R1,R2,R3,R4,R5,R7,R8,R9,R10,R11,R12,R14 as pushed below

        Push    "R0,R1,R2,R3,R4,R5,R7,R8,R9,R10,R11,R12,R14"
        SUB     sp, sp, #64
        Push    "R12"
        LDR     R11, inlog2px
        ADD     R11, R11, #8            ; scale x by the sprite's x pixel size, plus 8 fraction bits
        MOV     R0, R0, ASL R11
        MOV     R9, R9, ASL R11
        LDR     R11, inlog2py
        ADD     R11, R11, #8            ; likewise for y
        MOV     R8, R8, ASL R11
        MOV     R14, R14, ASL R11
        ADD     R12, SP, #4             ; R12 -> path buffer (workspace pointer is on the stack)
        MOV     R10, R12
        MOV     R11, #2                 ; path element type 2 (move)
        STMIA   R10!, {R11}
        STMIA   R10!, {R0,R8,R11}
        STMIA   R10!, {R9}
        STMIA   R10!, {R8,R11}
        MOV     R11, #0                 ; path element type 0 (end of path)
        STMIA   R10!, {R0,R14}
        STMIA   R10!, {R11}

; Transform the path (in place: R1 = 0), using the caller's matrix from R6.
; The X form of the SWI must be used so that an error comes back here with V
; set; otherwise the clean-up below could never run and the workspace above
; would be left on the stack.

        MOV     R0, R12
        MOV     R1, #0
        MOV     R2, R6
        MOV     R3, #0
        SWI     XDraw_TransformPath
        ADDVS   sp, sp, #64+8           ; drop saved R12, path buffer and stacked R0 (R0 -> error)
        Pull    "R1,R2,R3,R4,R5,R7,R8,R9,R10,R11,R12,R14",VS
        BVS     exitbiggie

; Read back the three transformed points: R0,R3 = point 0, R6,R8 = point 1,
; R11,R14 = point 2 (R4 and R9 just receive the intervening element types).
; Then build the four point coordinate block at save_outoffset:
;   point 0, point 1, point 1 + point 2 - point 0, point 2
; and leave R6 pointing at it, as if coordinates had been passed in.

        ADD     R10, R12, #4
        LDMIA   R10!, {R0,R3,R4,R6,R8,R9,R11,R14}
        Pull    "R12"
        ADR     R10, save_outoffset
        STMIA   R10!, {R0,R3,R6,R8}
        ADD     R6, R6, R11
        ADD     R8, R8, R14
        SUB     R6, R6, R0
        SUB     R8, R8, R3
        STMIA   R10!,{R6,R8,R11,R14}
        ADR     R6, save_outoffset
        ADD     sp, sp, #64
        Pull    "R0,R1,R2,R3,R4,R5,R7,R8,R9,R10,R11,R12,R14"

; Make Y values offset from the top of the sprite, not the bottom

coords_passed_in
        LDR     R2, [R1, #spHeight]
        ADD     R2, R2, #1
        SUB     R8, R2, R8
        SUB     R14, R2, R14

; Get R4 = distance from top of sprite, R2 = height of sprite, and set R8,R14 to be offset from R4

        SUBS    R2, R8, R14
        RSBMI   R2, R2, #0
        MOVMI   R4, R8
        MOVPL   R4, R14
        SUB     R8, R8, R4
        SUB     R14, R14, R4

; Store height, left, right of sprite for inner loop

        SUB     sp, sp, #trns_spr_vars_end
        CMP     R9, R0
        ORRGT   R10, R9, R2, LSL#16
        ORRLE   R10, R0, R2, LSL#16
        STR     R10, trns_comp_spr_height  ;       Height in top 16 bits, right side in bottom 16 bits
        STRGT   R0, trns_comp_spr_left    ;       Left side in bottom 16 bits (measurements in pixels)
        STRLE   R9, trns_comp_spr_left    ;       Left side in bottom 16 bits (measurements in pixels)

; Add R4*sprite_byte_width to sprite_pixel_data_start to give top of sprite, and store for loop
; On entry R4 = number of rows between the top of the sprite and the top of the
; source rectangle. On exit R10 = sprite row width in bytes (used by the code
; that follows) and trns_comp_spr_start = address of the first row to be read.

        LDR     R2,[r1,#spMode]
        Debug   ag,"spMode is",R2

        LDR     R2, [R1, #spImage]              ; offset of image data from start of sprite
        LDR     R10, [R1, #spWidth]             ; width in words, minus one
        Debug   ag,"spWidth is",R10
        Debug   ag,"sprite is at",R1
        ADD     R10, R10, #1
        MOV     R10, R10, LSL #2                ; words -> bytes
        MLA     R11, R10, R4, R2                ; R11 = row bytes * rows skipped + image offset
        ADD     R11, R11, R1
        STR     R11, trns_comp_spr_start

; and we have to do the same for the mask data now
; for now we just save the R4 value and deal with it once we have the mask
; equivalent of spWidth computed for the mask

        STR     R4, trns_comp_mask_base

; Work out byte width << 3-input_bpp, and store for loop

        LDR     R4, save_inlog2bpp
        CMP     R4, #4
        BCC     %FT89

        SUB     R4, R4, #3
        MOV     R11, R10, LSL #3
        MOV     R10, R10, LSR R4
        B       %FT88
89
        RSB     R4, R4, #3
        MOV     R11, R10, LSL #3
        MOV     R10, R10, LSL R4
88

        STR     R10, trns_comp_spr_byte_width

        ANDS    R11, R10, #&1F
        MOVNE   R11, #32
        ADD     R11, R11, R10
        BIC     R11, R11, #&1F

        STR     R11, trns_comp_spr_mask_width   ; used only for 1bpp masks

        ;the mask_base was used earlier for the number of rows to go into the
        ;data, so now we turn that into a byte offset

        LDR     R10, trns_comp_mask_base        ; recover row number
        MUL     R10, R11, R10                   ; convert to offset (bits)
        MOV     R10, R10, LSR #3                ; convert to bytes
        STR     R10, trns_comp_mask_base

; Get mask offset and store ttr pointer (clear mask GCOL bit if there is no mask)

        STR     R7, trns_comp_spr_ttr

        LDR     R10, BPP
        SUB     R10, R10,#1
        MOV     R11, #1:SHL:31
        MOV     R11, R11, ASR R10
        STR     R11, trns_masking_word                  ; suitable mask for pixels
        Debug   tr,"trns_masking_word",R11

        LDR     R10, [R1, #spTrans]
        LDR     R11, [R1, #spImage]

        Debug   ag,"trans and image offsets",R10,R11

        SUBS    R11, R10, R11
        BICEQ   R5, R5, #8                 ; no mask, so can't do a plot with b3 set

        Debug   ag,"trans minus image",R11

        LDR     R4, save_inlog2bpp
        CMP     R4, #4
        RSBCC   R4, R4, #3
        SUBCS   R4, R4, #3
        MOVCC   R11, R11, LSL R4
        MOVCS   R11, R11, LSR R4

        ; R4 is finished with now, so used as a working register for the mask stuff

        LDR     R10,[R1,#spTrans]             ; re-fetch offset to mask
        ADD     R10,R10,R1                    ; turn into absolute address
        LDR     R4, trns_comp_mask_base       ; fetch byte offset within 1bpp mask data
        ADD     R10,R10,R4                    ; include it
        STR     R10,trns_comp_mask_base       ; and store back the final result
        Debug   ag,"mask_base is",R10

        STR     R11, trns_comp_spr_masko
        Debug   ag,"stored masko as",R11

        Debug   tr,"Coordinate block:",R0,R8,R9,R14
        Debug   tr,"Mask offset:",#trns_comp_spr_masko
        Debug   tr,"Flags, GCOL, &Screen block, &TTR",R3, R5, R6, R7
        Debug   tr,"Height/right, left, spr_start, spr_byte_width",#trns_comp_spr_height,#trns_comp_spr_left,#trns_comp_spr_start,#trns_comp_spr_byte_width

; Copy the coordinate block
; Valid registers: R0,R8,R9,R14 - source rect, R1 - ptr to sprite, R5 GCOL, R6->matrix/coords, R7->ttr
; copy coordinates, adding graphics origin

        Push    "R1,R5,R7,R12"
        Push    "R0,R8,R9,R14"
        ADR     R0, trns_spr_xcoords + 4*8

; get O.S. to pixel shift (account for double pixel modes)

        LDR     R1, log2px
        LDR     R2, Log2bpp
        ADD     R1, R1, R2
        LDR     R2, Log2bpc
        SUB     R1, R1, R2
        LDR     R2, log2py

; get graphics origin

        LDR     R3, orgx
        LDR     R4, orgy

; get coordinates

        LDMIA   R6, {R5,R6,R7,R8,R9,R10,R11,R12}

; x coords

        ADD     R14, R7, R11
        SUB     R14, R14, R5
        CMP     R14, R9
        Pull    "R0,R8,R9,R14",NE
        Pull    "R1,R5,R7,R12",NE
        ADDNE   sp, sp, #trns_spr_vars_end
        BNE     badtransformation
        ADD     R5, R5, R3, LSL#8
        ADD     R7, R7, R3, LSL#8
        ADD     R9, R9, R3, LSL#8
        ADD     R11, R11, R3, LSL#8
        MOV     R5, R5, ASR R1
        MOV     R7, R7, ASR R1
        MOV     R9, R9, ASR R1
        MOV     R11, R11, ASR R1
        STMIA   R0!, {R5,R7,R9,R11}

; y coordinates
; R6,R8,R10,R12 = y0,y1,y2,y3 of the destination. As for the x coordinates, the
; four points must form a parallelogram: y1 + y3 - y0 must equal y2. The compare
; has to be unconditional - this point is only reached with the flags at EQ from
; the x coordinate check, so a conditional (NE) compare would never be executed
; and a bad set of y coordinates would be let through.

        ADD     R14, R8, R12
        SUB     R14, R14, R6
        CMP     R14, R10
        Pull    "R0,R8,R9,R14",NE
        Pull    "R1,R5,R7,R12",NE
        ADDNE   sp, sp, #trns_spr_vars_end
        BNE     badtransformation
        ADD     R6, R6, R4, LSL#8               ; add graphics origin (R4 = orgy)
        ADD     R8, R8, R4, LSL#8
        ADD     R10, R10, R4, LSL#8
        ADD     R12, R12, R4, LSL#8
        MOV     R6, R6, ASR R2                  ; scale down by the y shift in R2 (log2py)
        MOV     R8, R8, ASR R2
        MOV     R10, R10, ASR R2
        MOV     R12, R12, ASR R2
        STMIA   R0!, {R6,R8,R10,R12}

        Debug tr,"Transformed coords are:",R5,R6,R7,R8
        Debug tr,"Transformed coords are:",R9,R10,R11,R12

; Get determinant

        SUB     R9, R7, R5      ; x1
        SUB     R10, R11, R5     ; x2
        SUB     R11, R8, R6      ; y1
        SUB     R12, R12, R6     ; y2

        Debug tr,"x1,x2,y1,y2",R9,R10,R11,R12

; R4,5 = x1 * y2

        MOV     R0, R9
        MOV     R1, R12
        SSmultD R0,R1,R4,R5

; R6,7 = x2 * y1

        MOV     R0, R10
        MOV     R1, R11
        SSmultD R0,R1,R6,R7

; R4,R5 = x1*y2 - x2*y1 (48.16 precision)

        SUBS    R4, R4, R6
        SBC     R5, R5, R7

        Debug tr,"R4,R5,R6,R7:",R4,R5,R6,R7
; R4,R5 = x1*y2 - x2*y1 / 4 (48.14 precision)

        MOV     R4, R4, LSR#2
        ORR     R4, R4, R5,LSL#30
        MOV     R5, R5, ASR#2

; R6,R7 = 1 / (x1*y2 - x2*y1) (16.48 precision)

        TEQ     R4, #0
        TEQEQ   R5, #0
        MOVEQ   R6, #0
        MOVEQ   R7, #0
        BEQ     trns_division_by_zero
        MOV     R0, #0
        MOV     R1, #&40000000
        Debug tr,"Dividing:",R0,R1,R4,R5
        mextralong_divide R6,R7,R0,R1,R4,R5,R2,R3,R8
        Debug tr,"Gives:",R6,R7

trns_division_by_zero
        LDMFD   sp, {R0,R1,R2,R3}
        SUBS    R2, R2, R0
;        RSBMI   R2, R2, #0
        SUBS    R3, R3, R1
;        RSBMI   R3, R3, #0

; spr_inc_X_x = det*y2 [ * xsize ]

        MOV     R0, R12
        MOV     R1, R2
        BL      produce_increment
        MOV     R12, R4

; spr_inc_Y_x = det*-y1 [ * ysize ]

        RSB     R0, R11, #0
        MOV     R1, R3
        BL      produce_increment
        MOV     R11, R4

; spr_inc_X_y = det*-x2 [ * xsize ]

        RSB     R0, R10, #0
        MOV     R1, R2
        BL      produce_increment
        MOV     R10, R4

; spr_inc_Y_y = det*x1 [ * ysize ]

        MOV     R0, R9
        MOV     R1, R3
        BL      produce_increment
        MOV     R9, R4

; Store increments

        STR     R12, trns_spr_inc_X_x + 8*4
        STR     R11, trns_spr_inc_Y_x + 8*4
        STR     R10, trns_spr_inc_X_y + 8*4
        STR     R9, trns_spr_inc_Y_y + 8*4
        Pull    "R0,R8,R9,R14"
        Pull    "R1,R5,R7,R12"
        Debug   tr,"Coordinate block:",R0,R8,R9,R14

; That's all the horrid fixed point stuff out of the way
; Find the top y coordinate

        ADR     R3, trns_spr_ycoords
        LDMIA   R3!, {R4,R6,R10,R11}
        CMP     R6, R4
        MOVGT   R4, R6
        CMP     R10, R4
        MOVGT   R4, R10
        CMP     R11, R4
        MOVGT   R4, R11
        MOV     R2, R8

; Valid registers: R5 GCOL, R7->ttr, R4 - maximum ycoordinate of area
;                                       Get top y coordinate of centre of line being drawn in R1

        SUB     R1, R4, #128
        MOV     R1, R1, ASR#8
        LDR     R3, gwy1
        CMP     R1, R3
        MOVGT   R1, R3
        STR     R1, save_ycoord
        Debug tr,"Top coordinate on screen (PIXELS) is:",R1

;                                       Get address of line on screen

        LDR     R3, ywindlimit
        SUB     R3, R3, R1
        LDR     R8, screenstart
        LDR     R6, linelength
        MLA     R6, R3, R6, R8
        STR     R6, trns_spr_lineptr
;                                       Get ECF pointer
        MOV     R6, #VduDriverWorkSpace + BgEcfOraEor
        AND     R3, R3, #7
        ADD     R6, R6, R3,LSL#3
;        Debug   tr, "Ecf pointer is:",R6
        STR     R6, trns_ecf_ptr

;spr_X_x0_y% += ((screen_y%-y(0)) * (!asm_spr_inc_X_y)) / 256
; screen_y%-y(0) = R6.R4

        MOV     R1, R1, ASL#8
        ADD     R1, R1, #128
        LDR     R4, trns_spr_ycoords
        SUB     R4, R1, R4
;        Debug tr,"scry%-y0:",R4
        MOV     R6, R4, ASR#16
        BIC     R4, R4, R6, LSL#16

; inc_X_y = R8.R9

        LDR     R14, trns_spr_inc_X_x
;        Debug tr,"IncXx",R14
        LDR     R14, trns_spr_inc_X_y
;        Debug tr,"incXy,Xx0",R14,R0
        MOV     R8, R14, ASR#16
        BIC     R9, R14, R8, LSL#16

; multiply

        MUL     R10, R9, R6
        MLA     R10, R8, R4, R10
        MUL     R11, R6, R8
        MUL     R14, R9, R4
        MOV     R14, R14, LSR#8
        ADD     R10, R14, R10,LSL #8
        ADD     R0, R10, R0,LSL #16
        ADD     R0, R0, R11,LSL#24

        Debug tr,"Xx0y:",R0
;spr_Y_x0_y% += ((screen_y%-y(0)) * (!asm_spr_inc_Y_y)) / 256
; inc_Y_y = R8.R9

        LDR     R14, trns_spr_inc_Y_y
        Debug tr,"incYy=",R14
        MOV     R8, R14, ASR#16
        BIC     R9, R14, R8, LSL#16
        MUL     R10, R9, R6
        MLA     R10, R8, R4, R10
        MUL     R11, R6, R8
        MUL     R14, R9, R4
        MOV     R14, R14, LSR#8
        ADD     R10, R14, R10,LSL #8
        ADD     R2, R10, R2,LSL#16
        ADD     R2, R2, R11,LSL#24

; Valid registers: R0,R2 - X_x0_y/Y_x0_y, R5 GCOL, R7->ttr

        STR     R0, trns_spr_X_x0_y
        STR     R2, trns_spr_Y_x0_y
        Debug tr,"Top corner (w.r.t. sprite) is:",R0,R2

; Now compile the code

        BL      compile_transform_code
        ADR     R10, codebuffer
        STR     R10, trns_codebuffer
        Debug tr,"Code compiled:",R10

; Now, set up the edges in the edge block

        ADR     R0, trns_spr_edgeblock
        MOV     R1, #0
        ADR     R2, trns_spr_xcoords
        LDMIA   R2, {R2,R3,R4,R5, R6,R7,R8,R9}

; Edge 0->1

        CMP     R6, R7
        STMGTIA R0!,{R1,R3,R7}
        STMGTIA R0!,{R2,R6}
        STMLEIA R0!,{R1,R2,R6}
        STMLEIA R0!,{R3,R7}

; Edge 1->2

        ADD     R0, R0, #8
        CMP     R7, R8
        STMGTIA R0!,{R1,R4,R8}
        STMGTIA R0!,{R3,R7}
        STMLEIA R0!,{R1,R3,R7}
        STMLEIA R0!,{R4,R8}

; Edge 0->3

        ADD     R0, R0, #8
        CMP     R6, R9
        STMGTIA R0!,{R1,R5,R9}
        STMGTIA R0!,{R2,R6}
        STMLEIA R0!,{R1,R2,R6}
        STMLEIA R0!,{R5,R9}

; Edge 3->2

        ADD     R0, R0, #8
        CMP     R9, R8
        STMGTIA R0!,{R1,R4,R8}
        STMGTIA R0!,{R5,R9}
        STMLEIA R0!,{R1,R5,R9}
        STMLEIA R0!,{R4,R8}
        ADD     R0, R0, #8
        MVN     R1, #0
        STR     R1, [R0]

;;        Debug tr,"Active edge list:",#trns_spr_edgeblock,#trns_spr_edgeblock+4,#trns_spr_edgeblock+8,#trns_spr_edgeblock+12,#trns_spr_edgeblock+16,#trns_spr_edgeblock+20,#trns_spr_edgeblock+24
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+28,#trns_spr_edgeblock+32,#trns_spr_edgeblock+36,#trns_spr_edgeblock+40,#trns_spr_edgeblock+44,#trns_spr_edgeblock+48,#trns_spr_edgeblock+52
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+56,#trns_spr_edgeblock+60,#trns_spr_edgeblock+64,#trns_spr_edgeblock+68,#trns_spr_edgeblock+72,#trns_spr_edgeblock+76,#trns_spr_edgeblock+80
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+84,#trns_spr_edgeblock+88,#trns_spr_edgeblock+92,#trns_spr_edgeblock+96,#trns_spr_edgeblock+100,#trns_spr_edgeblock+104,#trns_spr_edgeblock+108

; Get top Y coordinate

        LDR     R1, save_ycoord
        MOV     R11, #3

; Start loop - looping until all edges are deactivated or below Y coord

00
        LDR     R14, gwy0
        CMP     R1, R14
        BLT     trns_completed_drawing

; Activate any new edges, deactivate old ones (R11 = count of deactivated edges)

        ADR     R10, trns_spr_edgeblock
01
        LDMIA   R10, {R0,R5,R6,R7,R8,R9,R14}

; Checked all the edges?

        CMN     R0, #1
        BEQ     activated_all_edges

; If dead then ignore

        TST     R0, #trns_deactivated
        BNE     edge_inactive

; If active then should we deactivate it?

        TST     R0, #trns_activated
        BNE     edge_active

; Test for activation

        SUB     R14, R8, #128
        CMP     R1, R14, ASR #8
        BGT     edge_inactive

; Activate an edge

        SUB     R8, R8, #128
        SUB     R6, R6, #128
        ORR     R0, R0, #trns_activated

; Calculate R3=ABS(deltaX) and R4=ABS(deltaY).

        SUBS    R3,R7,R5
        RSBLT   R3,R3,#0
        SUB     R4,R8,R6                ;Must be correct sign already - lines are sorted wrt Y coords.

; Record direction of line as 1 or -1 in top two bits of flags

        ORR     R0,R0,#&40000000        ;line goes rightwards
        ORRGE   R0,R0,#&80000000        ;Processor Status preserved from above - GE if line goes leftwards

; Now calculate target Y co-ordinate - the line must stop after this Y value is reached.

        MOV     R2,R6,ASR #8
        ADD     R2,R2,#1
        CMP     R1, R2
        MOVLT   R0, #trns_deactivated
        BLT     edge_inactive

; Valid registers: R0-flags, R1-current Y, R2-target Y, R3,R4-dX,dY,
;                  R7,R8-Upper x,y, R10-store for edge, R11-count of deactivated edges
; Now get Bresenham error in R6.

        AND     R14,R7,#&FF
        MOV     R7,R7,ASR #8
        ADD     R6,R3,R4                ;ABS(deltaX)+ABS(deltaY)
        CMP     R6,#&80000000:SHR:8
        BLO     out_qfill_spbres

; Only do complicated stuff if simple stuff will overflow

out_qfill_dpbres
        Push    "R0,R4,R5,R7"

; R7=-1,C=1 for leftward lines, R7=0,C=0 for rightward lines.

        MOVS    R7,R0,ASR #32

; Get R0=real sub-X coordinate on screen (R14=subpixel now)

        SUB     R0,R14,#128
        RSBCS   R0,R0,#0

; R4,R5 = R0*dY , R6,R7=R4,R5 -1 if leftward,+0 if rightward

        SSmultD R0,R4,R4,R5             ;subpixelX * ABS(deltaY) into R4,R5
        ADDS    R6,R4,R7                ;Accumulate into R6,R7
        ADC     R7,R5,R7

; R6,R7 += R0*dX

        AND     R0,R8,#255
        SSmultD R0,R3,R4,R5
        ADDS    R6,R4,R6
        ADC     R7,R5,R7

; Change error to full pixel instead of 256ths pixels

        MOV     R6,R6,LSR #8
        ORR     R6,R6,R7,LSL #24
        Pull    "R0,R4,R5,R7"
        B       out_qfill_bresdone

; Do simple stuff

out_qfill_spbres

; R6=-1,C=1 for leftward lines, R6=0,C=0 for rightward lines.

        MOVS    R6,R0,ASR #32

; Get R0=real sub-X coordinate on screen (R14=subpixel now)

        SUB     R14,R14,#128
        RSBCS   R14,R14,#0
        MLA     R6,R14,R4,R6            ;Accumulate subpixelX * ABS(deltaY)
        AND     R14,R8,#255
        MLA     R6,R14,R3,R6            ;Accumulate subpixelY * ABS(deltaX)
        MOV     R6,R6,ASR #8            ;Change units to full pixels

; Now R6=Bresenham value

out_qfill_bresdone
        MOV     R8,R8,ASR #8

; Now advance the edge until we're on the right scan line and the Bresenham
; value is negative.

        CMP     R1,R2                   ;Don't bother if we're below target Y
        BLT     out_qfill_doneclip
        SUBS    R8,R8,R1                ;Must set GT or EQ
        BLGT    out_qfill_fastclip
        CMP     R6,#0                   ;So this can only happen if R1 >= R2
out_qfill_Xcliploop
        ADDGE   R7,R7,R0,ASR #30        ;Advance in X direction as far as
        SUBGES  R6,R6,R4                ;  possible
        BGE     out_qfill_Xcliploop
out_qfill_doneclip
        ADD     R6,R6,R3

; Store flags,targetY,deltaX,deltaY

        STMIA   R10!,{R0,R2,R3,R4}
        STMIA   R10!, {R6,R7,R8}
        B       edge_no_store
edge_active

; If now below it then kill it

        CMP     R1, R5                  ; Check for deactivating the edge
        MOVLT   R0, #trns_deactivated
        SUBLT   R11, R11, #1
        ADD     R8,R8,R6                ;Adjust Bresenham value for Y move
edge_inactive
        STMIA   R10!, {R0,R5,R6,R7,R8,R9,R14}
edge_no_store
        B       %BT01
activated_all_edges
        CMP     R11, #0
        BLT     trns_completed_drawing

;;        Debug tr,"Active edge list:",#trns_spr_edgeblock,#trns_spr_edgeblock+4,#trns_spr_edgeblock+8,#trns_spr_edgeblock+12,#trns_spr_edgeblock+16,#trns_spr_edgeblock+20,#trns_spr_edgeblock+24
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+28,#trns_spr_edgeblock+32,#trns_spr_edgeblock+36,#trns_spr_edgeblock+40,#trns_spr_edgeblock+44,#trns_spr_edgeblock+48,#trns_spr_edgeblock+52
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+56,#trns_spr_edgeblock+60,#trns_spr_edgeblock+64,#trns_spr_edgeblock+68,#trns_spr_edgeblock+72,#trns_spr_edgeblock+76,#trns_spr_edgeblock+80
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+84,#trns_spr_edgeblock+88,#trns_spr_edgeblock+92,#trns_spr_edgeblock+96,#trns_spr_edgeblock+100,#trns_spr_edgeblock+104,#trns_spr_edgeblock+108
;;        Debug tr,"Number of active/not yet activated edges:",R11

; Move on all active lines

        ADR     R10, trns_spr_edgeblock
01
        LDMIA   R10!, {R0,R2,R3,R4,R6,R7,R8}
        CMN     R0, #1
        BEQ     trns_draw_line
        TST     R0, #trns_activated
        BEQ     %BT01

; R0-flags, R2 - target Y, R3 - deltaX, R4-deltaY
; R6-Bresenham error value, R7-current X

        SUB     R8,R7,R0,ASR #31        ;Calculate crossing X
        CMP     R1,R2                   ;Stop if already below target
        BLT     trns_deactivate_2
;        CMP     R4,R6,ASR #3            ;Use fast code?
;        BLLE    out_qfill_fasthoriz
        CMP     R6,#0                   ;Are we still below the edge?
02
        ADDGE   R7,R7,R0,ASR #30        ;Make X moves until we're above the
        SUBGES  R6,R6,R4                ;  edge, adjusting Bresenham value
        BGE     %BT02                   ;  for them
        STMDB   R10, {R0,R2,R3,R4,R6,R7,R8}
        B       %BT01

trns_deactivate_2
        MOV     R0, #trns_deactivated
        STMDB   R10, {R0,R2,R3,R4,R6,R7,R8}
        B       %BT01

; Now find the pair of lines which are active

trns_draw_line
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock,#trns_spr_edgeblock+4,#trns_spr_edgeblock+8,#trns_spr_edgeblock+12,#trns_spr_edgeblock+16,#trns_spr_edgeblock+20,#trns_spr_edgeblock+24
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+28,#trns_spr_edgeblock+32,#trns_spr_edgeblock+36,#trns_spr_edgeblock+40,#trns_spr_edgeblock+44,#trns_spr_edgeblock+48,#trns_spr_edgeblock+52
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+56,#trns_spr_edgeblock+60,#trns_spr_edgeblock+64,#trns_spr_edgeblock+68,#trns_spr_edgeblock+72,#trns_spr_edgeblock+76,#trns_spr_edgeblock+80
;;        Debug tr,"Active edge list:",#trns_spr_edgeblock+84,#trns_spr_edgeblock+88,#trns_spr_edgeblock+92,#trns_spr_edgeblock+96,#trns_spr_edgeblock+100,#trns_spr_edgeblock+104,#trns_spr_edgeblock+108
;;        Debug tr,"Number of active/not yet activated edges:",R11
        ADR     R10, trns_spr_edgeblock
01
        LDMIA   R10!, {R0,R2,R3,R4,R5,R6,R7}
        CMN     R0, #1
        BEQ     trns_completed_drawing
        TST     R0, #1
        BEQ     %BT01
01
        LDMIA   R10!, {R0,R2,R3,R4,R5,R6,R8}
        CMN     R0, #1
        BEQ     trns_completed_drawing
        TST     R0, #1
        BEQ     %BT01

; R7,R8 are x coords to plot between at current Y

        Push    "R1,R11,R12"
        MOV     trns_scr_y, R1
        MOV     trns_scr_lx, R7
        MOV     trns_scr_rx, R8
        ASSERT trns_scr_lx<>R6
        ASSERT trns_scr_y<>R6
        ASSERT trns_scr_y<>R7
        LDR     R14, trns_ecf_ptr + 3*4
        LDMIA   R14!, {trns_X_x0_y, trns_Y_x0_y}
        STR     trns_X_x0_y, trns_comp_ecf_ora + 3*4
        STR     trns_Y_x0_y, trns_comp_ecf_eor + 3*4
        CMP     R14, #VduDriverWorkSpace + BgEcfOraEor+64
        MOVGE   R14, #VduDriverWorkSpace + BgEcfOraEor
        STR     R14, trns_ecf_ptr + 3*4
        ADR     R14, trns_spr_X_x0_y + 3*4
        LDMIA   R14, {trns_X_x0_y,trns_Y_x0_y,trns_inc_X_x,trns_inc_Y_x,trns_inc_X_y,trns_inc_Y_y,trns_line_ptr}

;;        Debug tr,"Calling plotting routine: lx,rx,ty",trns_scr_lx, trns_scr_rx, trns_scr_y
;;        Debug tr,"incXx,Yx,Xy,Yy,lineptr",trns_inc_X_x,trns_inc_Y_x,trns_inc_X_y,trns_inc_Y_y,trns_line_ptr
;;        Debug tr,"X0Y0",trns_X_x0_y,trns_Y_x0_y

        BL      plot_and_calculate_row
        Pull    "R1,R11,R12"
        LDR     R14, linelength
        LDR     trns_line_ptr, trns_spr_lineptr
        ADD     trns_line_ptr, trns_line_ptr, R14
        STR     trns_line_ptr, trns_spr_lineptr
        LDR     trns_X_x0_y, trns_spr_X_x0_y
        LDR     trns_Y_x0_y, trns_spr_Y_x0_y
        LDR     trns_inc_X_y, trns_spr_inc_X_y
        LDR     trns_inc_Y_y, trns_spr_inc_Y_y
        SUB     trns_X_x0_y, trns_X_x0_y, trns_inc_X_y
        SUB     trns_Y_x0_y, trns_Y_x0_y, trns_inc_Y_y
        STR     trns_X_x0_y, trns_spr_X_x0_y
        STR     trns_Y_x0_y, trns_spr_Y_x0_y
        SUB     R1, R1, #1
        B       %BT00

trns_completed_drawing

; Common exit path, reached when there is nothing (more) to plot.
; On entry R1 holds the scan line the main loop had reached; it is taken
; here as one below the bottom row of the plotted area (hence the +1 later).
; If bit 0 of the first word at 'changedbox' is set, the four words that
; follow it (used here as left x, bottom y, right x, top y) are widened to
; include the bounding box of the transformed sprite, clipped in x to the
; graphics window.

; Update changed coordinate block?

        LDR     R14, changedbox
        LDR     R0, [R14], #4           ; R0 = flag word, R14 -> the four box words
        TST     R0, #1
        BEQ     trns_no_changed_box

; Get R6 = top y, R4 = bottom y

        LDR     R6, save_ycoord
        MOV     R4, R1

; Check top y was above gwy0, else no change to box

        LDR     R0, gwy0
        CMP     R6, R0
        BLT     trns_no_changed_box

; Find R3 = left x, R5 = right x
; (minimum and maximum of the four x coordinates, still in 256ths of a pixel)

        ADR     R0, trns_spr_xcoords
        LDMIA   R0, {R3,R7,R8,R9}
        MOV     R5, R3
        CMP     R3, R7
        MOVGT   R3, R7
        CMP     R3, R8
        MOVGT   R3, R8
        CMP     R3, R9
        MOVGT   R3, R9
        CMP     R5, R7
        MOVLT   R5, R7
        CMP     R5, R8
        MOVLT   R5, R8
        CMP     R5, R9
        MOVLT   R5, R9

; Now convert from 256ths pixels to pixels, and clip to graphics window
; (adding 128 before the shift rounds to the nearest pixel; R1 is reused
; as scratch from here on, the bottom y having been copied to R4 above)

        ADD     R3, R3, #128
        MOV     R3, R3, ASR #8
        ADD     R5, R5, #128
        MOV     R5, R5, ASR #8
        LDR     R0, gwx0
        LDR     R1, gwx1
        CMP     R3, R0
        MOVLT   R3, R0
        CMP     R5, R1
        MOVGT   R5, R1
        CMP     R3, R5
        BGT     trns_no_changed_box     ; Nothing left after clipping

; Unbodge double pixels
; Scale both x limits up by (Log2bpc - Log2bpp). This must be SUBS, not SUB:
; the MOVNEs below have to test this difference. With a plain SUB they were
; conditional on the CMP R3, R5 above, so a box exactly one pixel wide
; (R3 = R5) was never scaled.

        LDR     R7, Log2bpc
        LDR     R8, Log2bpp
        SUBS    R7, R7, R8
        MOVNE   R3, R3, LSL R7
        MOVNE   R5, R5, LSL R7
        ADD     R4, R4, #1              ; R1 was one row below the last row considered

; Get original box and update it
; (lower limits can only decrease, upper limits can only increase)

        LDMIA   R14, {R0,R1,R2,R7}
        CMP     R0, R3
        MOVGT   R0, R3
        CMP     R1, R4
        MOVGT   R1, R4
        CMP     R2, R5
        MOVLT   R2, R5
        CMP     R7, R6
        MOVLT   R7, R6
        STMIA   R14, {R0,R1,R2,R7}

; Exit: drop the local workspace frame from the stack, clear V and return,
; restoring R1-R9.

trns_no_changed_box
        ADD     sp, sp, #trns_spr_vars_end
        CLRV
        Pull    "R1-R9,PC"


; -------------------------------------------------------------
; - Routine to get increment in sprite due to x or y movement -
; -------------------------------------------------------------
; R0 - x or y distance
; R1 - size of sprite (width or height)
; R6,7 - 64 bit determinant (16.48 format)
; Result in R4
; All other registers (R0-R3, R5-R12) are stacked on entry and restored on
; exit, so only R4 changes as far as the caller is concerned.
;
; Method: the signs of R0 and of R6,R7 are stripped (the sign of their
; product is transferred to R1), the two magnitudes are multiplied together
; 16 bits at a time, and the resulting 16-bit digits R11.R10.R9.R8 are then
; each multiplied by R1 and recombined into R4.

produce_increment
        Push    "R0-R3,R5-R12,R14"
        Debug tr,"Inputs:",R0,R1,R6,R7

; First split R6/7 into R4-R7
; R3 = 0 if R6,R7 is non-negative, -1 if negative (ASR #32 replicates bit 31
; of R7). When negative, the 64-bit value is negated first.

        MOVS    R3, R7, ASR #32
        BEQ     %FT01
        RSBS    R6, R6, #0
        RSC     R7, R7, #0
01
; Four 16-bit digits, most significant first: R4, R5 (from R7), R7, R6 (from R6)
        MOV     R4, R7, LSR #16
        BIC     R5, R7, R4, LSL #16
        MOV     R7, R6, LSR #16
        BIC     R6, R6, R7, LSL #16

; Now check sign of R0, R3 = sign of product
; (N = sign of the product; C = shifter carry out = bit 31 of R0)

        EORS    R3, R3, R0, ASR #32

; If R3<0 (i.e. product<0) then R1=-R1

        RSBMI   R1, R1, #0

; If R0<0 then R0=-R0

        RSBCS   R0, R0, #0

; Split R0
; (R2 = upper 16 bits, R0 = lower 16 bits)

        MOV     R2, R0, LSR #16
        BIC     R0, R0, R2, LSL #16
        Debug tr,"Premultiply values:R0,R2,R1,R3:",R0,R2,R1,R3
        Debug tr,"Premultiply values:R4,R5,R7,R6:",R4,R5,R7,R6

; Notation in the stage comments below: "a.b" is the product Ra*Rb, and an
; L or U prefix selects the lower or upper 16 bits of that product. A bare
; register number (e.g. L6, U6) means the halves of that register's current
; value, which carries the overflow from the previous stage.
; The flag-setting ADDS / ADDCS / ADC sequences propagate carries between
; the 16-bit columns, so the instruction order within a stage matters.

; Now produce R8 = U0.6 + L2.6 + L0.7, R6 = U2.6 + U0.7 (inc. carries)

        MUL     R8, R0, R6
        MUL     R6, R2, R6
        MUL     R3, R0, R7
        ADDS    R3, R3, R6
        MOV     R6, R3, LSR #16
        ADDCS   R6, R6, #&10000
        ADDS    R8, R8, R3, LSL #16
        MOV     R8, R8, LSR #16
        ADC     R6, R6, #0
        Debug tr,"Stage 1:R6,R8",R6,R8

; Now produce R9 = L6 + L2.7 + L0.5, R7 = U6 + U2.7 + U0.5 (inc carries)

        MUL     R9, R0, R5
        MUL     R7, R2, R7
        ADDS    R3, R7, R9
        MOV     R7, R3, LSR #16
        ADDCS   R7, R7, #&10000
        ADD     R7, R7, R6, LSR #16
        MOV     R9, R3, LSL #16
        ADDS    R9, R9, R6, LSL #16
        MOV     R9, R9, LSR #16
        ADC     R7, R7, #0
        Debug tr,"Stage 2:R7,R9",R7,R9

; Now produce R10 = L7 + L2.5 + L0.4, R5 = U7 + U2.5 + U0.4 (inc. carries)

        MUL     R10, R0, R4
        MUL     R5, R2, R5
        ADDS    R3, R5, R10
        MOV     R5, R3, LSR #16
        ADDCS   R5, R5, #&10000
        ADD     R5, R5, R7, LSR #16
        MOV     R10, R3, LSL #16
        ADDS    R10, R10, R7, LSL #16
        MOV     R10, R10, LSR #16
        ADC     R5, R5, #0
        Debug tr,"Stage 3:R10,R5",R10,R5

; Now produce R11 = L5 + L2.4
; (anything above the low 16 bits is discarded by the BIC)

        MLA     R11, R2, R4, R5
        MOV     R3, R11, LSR #16
        BIC     R11, R11, R3, LSL #16

; Now R11.R10.R9.R8 (sixteen bits each) = 256/Det*Size in 32.32 form
; So get R1 * R11.R10.R9.R8 into R4
; Each digit is multiplied by the (signed) R1 and then shifted into place;
; the arithmetic shifts keep the sign of the partial products.

        MUL     R8, R1, R8
        MUL     R9, R1, R9
        MUL     R10, R1, R10
        MUL     R11, R1, R11
        MOV     R4, R8, ASR #24
        ADD     R4, R4, R9, ASR #8
        ADD     R4, R4, R10, ASL #8
        ADD     R4, R4, R11, ASL #24

        ;bug fix, round up if necessary
        ;(bit 7 of R9 is the most significant bit dropped by the R9, ASR #8 term)
        TST     R9,#&80
        ADDNE   R4,R4,#1

        Debug tr,"Output:R4",R4

        Pull    "R0-R3,R5-R12,PC"



; -----------------------------------
; - DrQfill routines required above -
; -----------------------------------
; Fast Y clipping routine
;
; Advances an edge by several scan lines in one go, instead of stepping the
; Bresenham value a line at a time.
; In:   R0 = edge flags (only the sign is used here: negative subtracts from X)
;       R3 = deltaX, R4 = deltaY
;       R6 = Bresenham value, R7 = X co-ordinate
;       R8 = number of Y steps to take
; Out:  R6 = updated Bresenham value, R7 = updated X co-ordinate
;       R8 = number of X steps that were applied to R7
;       R1, R2 preserved; condition flags altered

out_qfill_fastclip
        Push    "R1,R2,R7,LR"
; R7 is stacked because it serves as the high word of a 64-bit Bresenham
; value (R6,R7) until the Pull below restores the X co-ordinate.
        MOV     R7,R6,ASR #31           ;Sign-extend Bresenham value
        SSmultD R8,R3,R1,R2             ;(no. Y steps) * deltaX into R1,R2
        ADDS    R6,R6,R1                ;Accumulate into Bresenham value
        ADCS    R7,R7,R2
        MOVMI   R8,#0                   ;Check for still being to the right
        BMI     out_qfill_fastclipdone  ;  of the edge
; Argument bytes for arith_DSdivS: dividend in R6 (and R7), divisor R4,
; quotient returned in R8.
        BL      arith_DSdivS            ;Divide by deltaY
        DCB     R6,R4,R8,0
        SSmultD R8,R4,R1,R2             ;(no. X steps) * deltaY into R1,R2
        SUB     R6,R6,R1                ;Not interested in high word!

out_qfill_fastclipdone
        Pull    "R1,R2,R7,LR"
        TEQ     R0,#0                   ;Move X co-ord. in right direction
        ADDPL   R7,R7,R8
        SUBMI   R7,R7,R8
        MOV     PC,LR

; Subroutine to advance an edge horizontally fast (i.e. using long division
; rather than division by repeated subtraction!)
;   Updates R6 and R7, corrupts R2 and R3, preserves flags & other registers
;
; In:   R0 = edge flags (sign selects the X direction), R4 = deltaY,
;       R6 = Bresenham value, R7 = X co-ordinate
; NOTE(review): the TEQ below alters the N and Z condition flags, so
; "preserves flags" above presumably refers to the edge flags word in R0
; rather than the processor flags - confirm.
; NOTE(review): DivRem is a macro defined elsewhere; presumably it leaves
; R6 DIV R4 in R2 and the remainder in R6, using R3 as workspace - confirm
; against the macro definition.
; The only call to this routine visible in this part of the file (in the
; "Move on all active lines" loop) is commented out.

out_qfill_fasthoriz
        DivRem  R2,R6,R4,R3
        TEQ     R0,#0
        ADDPL   R7,R7,R2
        SUBMI   R7,R7,R2
        MOV     PC,LR

 [ :LNOT:usemull
; Subroutine to multiply two single precision signed numbers together and
; get a double precision result. The word following the BL should contain
; the numbers of the two operand registers in its bottom two bytes and the
; number of the register to take the ls part of the result in the next byte.
; The ms part of the result will go into the next register.
;   This routine will only work on registers R0-R8.
;
; This routine is only assembled when 'usemull' is false.
; Operands and results are passed through the stack: R0-R8 are pushed, the
; operand values are read from their stacked slots, the result is written
; over the stacked slots of the result registers, and the Pull then loads
; it into the real registers. Because the ms word goes to the slot after
; the ls word, the ls result register needs to be in the range R0-R7 for
; both words to land inside the stacked block.
; The return skips the argument word that follows the BL.

arith_SSmultD
        Push    "R0-R8"                 ;REMEMBER: need to stack R0-R8 so they can hold results
        RSB     R8,PC,PC                ;get embedded PSR flags into R8 - 26/32-bit mode neutral
        LDRB    R0,[R14,-R8]!           ;Get first operand; R14 := (R14 - PSR)
        LDR     R0,[R13,R0,LSL #2]
        LDRB    R1,[R14,#1]             ;Get second operand
        LDR     R1,[R13,R1,LSL #2]
        MOV     R4,R0,LSR #16           ;Split first operand into halves
        BIC     R3,R0,R4,LSL #16
; The Z flag set by this MOVS survives until the MULNE below (none of the
; intervening instructions sets flags): when the upper half of the second
; operand is zero the high product is zero too, and R6 already holds 0.
        MOVS    R6,R1,LSR #16           ;Split second operand into halves
        BIC     R5,R1,R6,LSL #16
        MUL     R2,R3,R5                ;Produce low partial product
        MUL     R3,R6,R3                ;And middle partial products
        MUL     R5,R4,R5
        MULNE   R6,R4,R6                ;And high partial product
        ADDS    R3,R3,R5                ;Add middle partial products, dealing
        ADDCS   R6,R6,#&10000           ;  with overflow
        ADDS    R2,R2,R3,LSL #16        ;Add middle partial product sum into
        ADC     R6,R6,R3,LSR #16        ;  result
; The multiply above treated both operands as unsigned; the two conditional
; subtractions correct the high word for negative operands.
        TEQ     R0,#0                   ;Add cross products of operands and
        SUBMI   R6,R6,R1                ;  operand sign extensions into
        TEQ     R1,#0                   ;  result
        SUBMI   R6,R6,R0
        LDRB    R0,[R14,#2]             ;Store the result on the stack, to be
        ADD     R0,R13,R0,LSL #2        ;  picked up by the correct registers
        STMIA   R0,{R2,R6}
        Pull    "R0-R8"
        ADD     PC,R14,#4               ;Skip the argument word on return (don't use ADDS!)
 ]

; Subroutine to divide a double precision unsigned number by a single
; precision unsigned number, yielding a single precision unsigned result.
; The word following the BL should contain the number of the register holding
; the ls part of the dividend in its bottom byte; the ms part of the dividend
; is in the next register. The next byte of the word contains the number of
; the divisor register, and the next byte the number of the register in which
; to deposit the quotient.
;   This routine will only work on registers R0-R8. It assumes that the
; divisor is not zero, and that the quotient will not overflow.
;
; As with arith_SSmultD, values are exchanged through the stacked copies of
; R0-R8: the operands are read from their stack slots and the quotient is
; written into the slot of the result register before the Pull. The
; remainder is not returned. The return skips the argument word.

arith_DSdivS
        Push    "R0-R8"
        RSB     R8,PC,PC                ;get embedded PSR flags into R8 - 26/32-bit mode neutral
        LDRB    R0,[R14,-R8]!           ;Get first operand; R14 := (R14 - PSR)
        ADD     R0,R13,R0,LSL #2
        LDMIA   R0,{R0,R1}
        LDRB    R2,[R14,#1]             ;Get second operand
        LDR     R2,[R13,R2,LSL #2]
        MOV     R3,#1                   ;Init. quotient with a sentinel bit

; Shift-and-subtract division: one quotient bit per pass. The loop ends
; when the sentinel bit placed in R3 above is shifted out into carry,
; i.e. after 32 passes.

arith_DSdivS_loop
        ADDS    R0,R0,R0                ;Shift a bit up into the ms half of
        ADC     R1,R1,R1                ;  the dividend
        CMP     R1,R2                   ;Do trial subtraction, producing
        SUBCS   R1,R1,R2                ;  result bit in C
        ADCS    R3,R3,R3                ;Result bit into result, then loop
        BCC     arith_DSdivS_loop       ;  unless sentinel bit shifted out
        LDRB    R0,[R14,#2]             ;Store the result on the stack, to be
        STR     R3,[R13,R0,LSL #2]      ;  picked up by the correct registers
        Pull    "R0-R8"
        ADD     PC,R14,#4               ;Skip the argument word on return (don't use ADDS!)


;       ---------------------------------
;       - The code which is compiled in -
;       ---------------------------------
compiled_routine_stacked        *       calc_row_stacked + 1*4 - 4; Number of bytes stacked since main loop

;                       Start of routine
; Entry sequence: stacks the return address, puts a single marker bit
; (bit 31) in the output word, clears the output mask, and returns at once
; if trns_xsize <= 0. The ASSERT pins the fragment at five instructions.
trnslp_strt
        Push    "R14"
        MOV     trns_out_word, #&80000000
        MOV     trns_out_mask, #0
        CMP     trns_xsize, #0
        Pull    "PC",LE
trnslp_strt_size        *       .-trnslp_strt
        ASSERT (trnslp_strt_size) = 5*4

;                       Get address of lefthand x on screen (dependent on out_bpp)
; Six alternative fragments, one per output depth, each four instructions
; long (checked by the ASSERT at the end; the 1bpp one is padded with
; ANDEQ R0,R0,R0, which leaves R0 unchanged whether or not it executes).
; Each one adds the byte offset of pixel trns_out_x to trns_out_ptr, rounds
; the pointer down to a word boundary, and leaves trns_out_x holding the bit
; offset of that pixel within the word.
; The bracketed lists on the individual lines give the value used by each
; depth variant.
trnslp_getaddr_o1
        ADD     trns_out_ptr, trns_out_ptr, trns_out_x, LSR# 3 ;(0.1.2.3 - 8.4.2.1 out_bpp)
        BIC     trns_out_ptr, trns_out_ptr, #3
        AND     trns_out_x, trns_out_x, #31                     ;(3.7.15.31 - 8.4.2.1 out_bpp)
        ANDEQ   R0,R0,R0
trnslp_getaddr_o2
        ADD     trns_out_ptr, trns_out_ptr, trns_out_x, LSR# 2 ;(0.1.2.3 - 8.4.2.1 out_bpp)
        BIC     trns_out_ptr, trns_out_ptr, #3
        MOV     trns_out_x, trns_out_x, LSL#1
        AND     trns_out_x, trns_out_x, #30                     ;(3.7.15.31 - 8.4.2.1 out_bpp)
trnslp_getaddr_o4
        ADD     trns_out_ptr, trns_out_ptr, trns_out_x, LSR# 1  ;(0.1.2.3 - 8.4.2.1 out_bpp)
        BIC     trns_out_ptr, trns_out_ptr, #3
        MOV     trns_out_x, trns_out_x, LSL#2
        AND     trns_out_x, trns_out_x, #28                     ;(3<<3.7<<2.15<<1.31<<0 - 8.4.2.1 out_bpp)
trnslp_getaddr_o8
        ADD     trns_out_ptr, trns_out_ptr, trns_out_x          ;(0.1.2.3 - 8.4.2.1 out_bpp)
        BIC     trns_out_ptr, trns_out_ptr, #3
        MOV     trns_out_x, trns_out_x, LSL#3
        AND     trns_out_x, trns_out_x, #24                     ;(3.7.15.31 - 8.4.2.1 out_bpp)
; Two bytes per pixel: bit offset is 0 or 16
trnslp_getaddr_o16
        ADD     trns_out_ptr, trns_out_ptr, trns_out_x, LSL #1
        BIC     trns_out_ptr, trns_out_ptr, #3
        MOV     trns_out_x, trns_out_x, LSL #4
        AND     trns_out_x, trns_out_x, #16
; Despite the "o24" name this variant steps four bytes per pixel (LSL #2),
; so the bit offset always ends up as 0
trnslp_getaddr_o24
        ADD     trns_out_ptr, trns_out_ptr, trns_out_x, LSL #2
        BIC     trns_out_ptr, trns_out_ptr, #3
        MOV     trns_out_x, trns_out_x, LSL#5
        AND     trns_out_x, trns_out_x, #0
trnslp_getaddr_size     *       .-trnslp_getaddr_o24
        ASSERT (.-trnslp_getaddr_o1) = 6*trnslp_getaddr_size

;                       Get pixel and macro word shifted to align with lefthand pixel inside screen word
; Shifts the output word right by trns_out_x bits (a register-specified
; shift). One instruction, as the ASSERT checks.
trnslp_getwam
        MOV     trns_out_word, trns_out_word, LSR trns_out_x
trnslp_getwam_size      *       .-trnslp_getwam
        ASSERT (trnslp_getwam_size) = 1*4

;                       Start of loop
; Computes trns_offset, the pixel index into the sprite for the current
; (trns_X, trns_Y) position, clamping both co-ordinates to the sprite:
;   row    = trns_Y >> 16, limited to 0 .. (trns_spr_height >> 16) - 1
;   column = trns_X, limited to (trns_spr_left << 16) .. (trns_spr_right << 16) - 1
;   trns_offset = row * trns_byte_width + (column >> 16)
; The code is split into three separately sized pieces. The conditional
; instruction at the start of pieces 2 and 3 uses the flags set by the CMP
; that ends the previous piece, so nothing that changes the flags may come
; between them.
trnslp_stloop1
        MOVS    trns_offset, trns_Y, ASR#16
        MOVMI   trns_offset, #0
        MOV     trns_dummy14, trns_spr_height, LSR#16
        CMP     trns_dummy14, trns_Y, ASR#16
trnslp_stloop1_size      *       .-trnslp_stloop1
trnslp_stloop2
        SUBLE   trns_offset, trns_dummy14, #1
        MUL     trns_offset, trns_byte_width, trns_offset
        MOV     trns_dummy14, trns_X
        CMP     trns_X, trns_spr_left, LSL #16
trnslp_stloop2_size      *       .-trnslp_stloop2
trnslp_stloop3
        MOVLT   trns_dummy14, trns_spr_left, LSL #16
        CMP     trns_X, trns_spr_right, LSL #16
        MOVGE   trns_dummy14, trns_spr_right, LSL #16
        SUBGE   trns_dummy14, trns_dummy14, #1
        ADD     trns_offset, trns_offset, trns_dummy14, LSR #16
trnslp_stloop3_size      *       .-trnslp_stloop3

;                       Start of loop (1bpp mask)
; Same clamping and offset calculation as trnslp_stloop1-3, but in addition
; a second offset is built from the same clamped row and column using the
; row width loaded from trns_comp_spr_mask_width, and stored in
; trns_comp_mask_offset. trns_dummy12 is used as workspace.
; As with the trnslp_stloop pieces, the SUBLE that starts piece 2 relies on
; the flags left by the CMP that ends piece 1.
trnslp_nmstloop1
        MOVS    trns_offset, trns_Y, ASR#16
        MOVMI   trns_offset, #0
        MOV     trns_dummy14, trns_spr_height, LSR#16
        CMP     trns_dummy14, trns_Y, ASR#16
trnslp_nmstloop1_size      *       .-trnslp_nmstloop1
trnslp_nmstloop2
        SUBLE   trns_offset, trns_dummy14, #1
        LDR     trns_dummy12, trns_comp_spr_mask_width + compiled_routine_stacked
        MUL     trns_dummy12, trns_offset, trns_dummy12
        MUL     trns_offset, trns_byte_width, trns_offset
        MOV     trns_dummy14, trns_X
trnslp_nmstloop2_size      *       .-trnslp_nmstloop2
trnslp_nmstloop3
        CMP     trns_X, trns_spr_left, LSL #16
        MOVLT   trns_dummy14, trns_spr_left, LSL #16
        CMP     trns_X, trns_spr_right, LSL #16
        MOVGE   trns_dummy14, trns_spr_right, LSL #16
        SUBGE   trns_dummy14, trns_dummy14, #1
trnslp_nmstloop3_size      *       .-trnslp_nmstloop3
trnslp_nmstloop4
        ADD     trns_offset, trns_offset, trns_dummy14, LSR #16
        ADD     trns_dummy12, trns_dummy12, trns_dummy14, LSR #16
        STR     trns_dummy12, trns_comp_mask_offset + compiled_routine_stacked
trnslp_nmstloop4_size      *       .-trnslp_nmstloop4


;                       Shift pixel and mask words by out_bpp
; Six alternative two-instruction fragments (checked by the ASSERT), one
; per output depth. The MOVS puts the last bit shifted out of
; trns_out_word into the carry flag, so carry becomes set when the marker
; bit seeded at trnslp_strt falls off the bottom of the word; the mask
; shift that follows does not disturb the flags.
trnslp_shfwam_o1
        MOVS    trns_out_word, trns_out_word, LSR#1
        MOV     trns_out_mask, trns_out_mask, LSR#1
trnslp_shfwam_o2
        MOVS    trns_out_word, trns_out_word, LSR#2
        MOV     trns_out_mask, trns_out_mask, LSR#2
trnslp_shfwam_o4
        MOVS    trns_out_word, trns_out_word, LSR#4
        MOV     trns_out_mask, trns_out_mask, LSR#4
trnslp_shfwam_o8
        MOVS    trns_out_word, trns_out_word, LSR#8
        MOV     trns_out_mask, trns_out_mask, LSR#8
trnslp_shfwam_o16
        MOVS    trns_out_word, trns_out_word, LSR#16
        MOV     trns_out_mask, trns_out_mask, LSR#16
; Whole-word pixels: the output word is simply cleared, with carry set
; (1 shifted right by one gives 0 and sets C).
trnslp_shfwam_o24
        ;MOVS    trns_out_word, trns_out_word
        ;MOV     trns_out_mask, trns_out_mask
        ;achieve the same effect as a LSR#32 would do, if such existed!
        MOV      trns_out_word,#1
        MOVS     trns_out_word,trns_out_word,LSR #1
trnslp_shfwam_size      *       .-trnslp_shfwam_o24
        ASSERT (.-trnslp_shfwam_o1) = trnslp_shfwam_size*6

;can't do the 32 bit case in two instructions, so here's the last one
;(a third instruction, outside the fixed-size table above, clearing the mask)

trnslp_shfwam_o24_2
        MOV      trns_out_mask, #0

;                       Read pixel from sprite (dependent on in_bpp)
; Six alternative fragments, one per input depth, each five instructions
; long (checked by the ASSERT at the end); the shorter ones are padded with
; ANDEQ/ANDNE R0,R0,R0, which leave R0 unchanged whether or not they
; execute. Each reads the pixel with index trns_offset, relative to
; trns_spr_start, into trns_dummy14. The sub-byte variants use trns_dummy12
; to hold the bit position of the pixel within its byte.
; trnslp_readpx_i1_size covers only the four working instructions of the
; 1bpp variant, excluding its padding.
trnslp_readpx_i1
        LDRB    trns_dummy14, [trns_spr_start, trns_offset, LSR #3]     ;(0.1.2.3 - 8.4.2.1 in_bpp)
        AND     trns_dummy12, trns_offset, #7                           ;(0.1.3.7 - 8.4.2.1 in_bpp)
        MOV     trns_dummy14, trns_dummy14, LSR trns_dummy12
        AND     trns_dummy14, trns_dummy14, #&1                         ;(&FF.&F.&3.&1 - 8.4.2.1 in_bpp)
trnslp_readpx_i1_size * .-trnslp_readpx_i1
        ANDEQ   R0,R0,R0
trnslp_readpx_i2
        LDRB    trns_dummy14, [trns_spr_start, trns_offset, LSR #2]     ;(0.1.2.3 - 8.4.2.1 in_bpp)
        AND     trns_dummy12, trns_offset, #3                           ;(0.1.3.7 - 8.4.2.1 in_bpp)
        MOV     trns_dummy12, trns_dummy12, LSL#1                       ;(3.2.1.0 - 8.4.2.1 in_bpp)
        MOV     trns_dummy14, trns_dummy14, LSR trns_dummy12
        AND     trns_dummy14, trns_dummy14, #&3                         ;(&FF.&F.&3.&1 - 8.4.2.1 in_bpp)
trnslp_readpx_i4
        LDRB    trns_dummy14, [trns_spr_start, trns_offset, LSR #1]     ;(0.1.2.3 - 8.4.2.1 in_bpp)
        AND     trns_dummy12, trns_offset, #1                           ;(0.1.3.7 - 8.4.2.1 in_bpp)
        MOV     trns_dummy12, trns_dummy12, LSL#2                       ;(3.2.1.0 - 8.4.2.1 in_bpp)
        MOV     trns_dummy14, trns_dummy14, LSR trns_dummy12
        AND     trns_dummy14, trns_dummy14, #&F                         ;(&FF.&F.&3.&1 - 8.4.2.1 in_bpp)
trnslp_readpx_i8
        LDRB    trns_dummy14, [trns_spr_start, trns_offset]
        ANDEQ   R0,R0,R0
        ANDNE   R0,R0,R0
        ANDNE   R0,R0,R0
        ANDEQ   R0,R0,R0
; A word load followed by a shift up and back down keeps the low 16 bits.
; NOTE(review): for an odd pixel index the address is only halfword aligned,
; so this appears to rely on LDR rotating the addressed halfword into the
; low 16 bits at a non-word-aligned address - confirm for the target CPUs.
trnslp_readpx_i16
        ;note - this will be word or half word aligned
        LDR     trns_dummy14, [trns_spr_start, trns_offset, LSL #1]
        MOV     trns_dummy14, trns_dummy14, LSL #16
        MOV     trns_dummy14, trns_dummy14, LSR #16
        ANDNE   R0,R0,R0
        ANDEQ   R0,R0,R0
trnslp_readpx_i32
        LDR     trns_dummy14, [trns_spr_start, trns_offset, LSL #2]
        ANDEQ   R0,R0,R0
        ANDNE   R0,R0,R0
        ANDNE   R0,R0,R0
        ANDEQ   R0,R0,R0
trnslp_readpx_size      *       trnslp_readpx_i8-trnslp_readpx_i4
        ASSERT (.-trnslp_readpx_i1) = trnslp_readpx_size*6

;                       Translate pixel through ttr
; Alternative fragments that replace the pixel value in trns_dummy14 with a
; table entry. All of them fetch the table address from trns_comp_spr_ttr
; into trns_dummy12 first.
;   trnslp_transpx  - byte-wide entries, indexed by pixel value
trnslp_transpx
        LDR     trns_dummy12, trns_comp_spr_ttr + compiled_routine_stacked
        LDRB    trns_dummy14, [trns_dummy12, trns_dummy14]
trnslp_transpx_size      *      .-trnslp_transpx
        ASSERT (trnslp_transpx_size) = 2*4

;   trnslp_transpx1 - word-wide entries (index scaled by 4)
trnslp_transpx1
        LDR     trns_dummy12, trns_comp_spr_ttr + compiled_routine_stacked
        LDR     trns_dummy14, [trns_dummy12, trns_dummy14, LSL #2]
trnslp_transpx1_size    *       .-trnslp_transpx1

;   trnslp_transpx2 - only assembled when ignore_ttr is set: entries are
;                     eight bytes apart; the first word of each is loaded
;                     and shifted right by 8
        [ ignore_ttr
trnslp_transpx2
        LDR     trns_dummy12, trns_comp_spr_ttr + compiled_routine_stacked
        LDR     trns_dummy14, [trns_dummy12, trns_dummy14, LSL #3]
        MOV     trns_dummy14, trns_dummy14, LSR #8 ;now in correct form for 32bpp
trnslp_transpx2_size    *       .-trnslp_transpx2
        ;use trnslp_munge_32to16 if doing 16bpp
        ]

;   trnslp_use32K   - keeps the low 15 bits of the pixel, takes the table
;                     pointer from the second word (offset 4) of the block
;                     addressed by trns_comp_spr_ttr, and indexes it by byte
trnslp_use32K
       MOV      trns_dummy14,trns_dummy14,LSL #17
       MOV      trns_dummy14,trns_dummy14,LSR #17

       LDR      trns_dummy12,trns_comp_spr_ttr + compiled_routine_stacked ;fetch the table address
       LDR      trns_dummy12,[trns_dummy12,#4]
       ; trns_dummy12 is correct at this point....
       LDRB     trns_dummy14,[trns_dummy12,trns_dummy14]
trnslp_use32K_size      *      .-trnslp_use32K

;                       Copy pixel into output word (dependent on out_bpp)
; ORs the pixel in trns_dummy14, shifted left by xxx, into the output word.
; NOTE(review): xxx is defined elsewhere; presumably the shift amount is a
; placeholder that is adjusted for the output depth when the instruction is
; copied - confirm against the compiler routine.
trnslp_setpx
        ORR     trns_out_word, trns_out_word, trns_dummy14, LSL #xxx
trnslp_setpx_size       *       .-trnslp_setpx
        ASSERT (trnslp_setpx_size) = 1*4

;                       If sprite has no mask, then ORR set bits into the output mask word
; Four alternative one-instruction fragments (checked by the ASSERT) that
; set the top 1, 2, 4 or 8 bits of trns_out_mask unconditionally.
trnslp_sprnomask_o1
        ORR     trns_out_mask, trns_out_mask, #&80000000
trnslp_sprnomask_o2
        ORR     trns_out_mask, trns_out_mask, #&C0000000
trnslp_sprnomask_o4
        ORR     trns_out_mask, trns_out_mask, #&F0000000
trnslp_sprnomask_o8
        ORR     trns_out_mask, trns_out_mask, #&FF000000
trnslp_sprnomask_size   *       .-trnslp_sprnomask_o8
        ASSERT (.-trnslp_sprnomask_o1) = trnslp_sprnomask_size*4

; Deeper outputs: the bits to set are loaded from trns_masking_word instead
; of being an immediate. Two instructions; corrupts trns_dummy14.
trnslp_sprnomask16or24
        LDR     trns_dummy14, trns_masking_word +compiled_routine_stacked
        ORR     trns_out_mask, trns_out_mask, trns_dummy14
trnslp_sprnomask16or24_size * .-trnslp_sprnomask16or24
        ASSERT  trnslp_sprnomask16or24_size = 4*2
;                       If sprite has a mask then read it in (also uses trnslp_readpx_i, above)
; Adds the value stored in trns_comp_spr_masko to trns_offset, so that a
; following trnslp_readpx_i* fragment reads from the mask rather than the
; image. Two instructions; corrupts trns_dummy14.
trnslp_sprmask
        LDR     trns_dummy14, trns_comp_spr_masko + compiled_routine_stacked
        ADD     trns_offset, trns_offset, trns_dummy14
trnslp_sprmask_size     *       .-trnslp_sprmask
        ASSERT (trnslp_sprmask_size) = 2*4

;                       If sprite has a 1BPP mask then read it in
; (note: no longer uses trnslp_readpx_i1 - it needs a different base address and must
; preserve the original one by avoiding trns_spr_start (R1) )
; First piece: loads the mask base address and the mask bit offset saved in
; trns_comp_mask_offset, then fetches the byte containing that bit.
; Second piece: shifts the wanted bit down to bit 0 and isolates it, leaving
; 0 or 1 in trns_dummy14. trns_offset and trns_dummy12 are overwritten.

trnslp_new_sprmask
        ;
        LDR     trns_dummy12, trns_comp_mask_base + compiled_routine_stacked
        LDR     trns_offset, trns_comp_mask_offset + compiled_routine_stacked
        LDRB    trns_dummy14,[trns_dummy12, trns_offset, LSR #3]
trnslp_new_sprmask_size     *       .-trnslp_new_sprmask
trnslp_new_sprmask2
        AND     trns_dummy12, trns_offset, #7
        MOV     trns_dummy14, trns_dummy14, LSR trns_dummy12
        AND     trns_dummy14, trns_dummy14, #1
trnslp_new_sprmask2_size     *       .-trnslp_new_sprmask2

;                       Setup valid mask pixel into mask word
trnslp_setmask_o1
        TEQ     trns_dummy14, #0                ;Is the mask pixel set?
        ORRNE   trns_out_mask, trns_out_mask, #&80000000
trnslp_setmask_o2
        TEQ     trns_dummy14, #0                ;Is the mask pixel set?
        ORRNE   trns_out_mask, trns_out_mask, #&C0000000
trnslp_setmask_o4
        TEQ     trns_dummy14, #0                ;Is the mask pixel set?
        ORRNE   trns_out_mask, trns_out_mask, #&F0000000
trnslp_setmask_o8
        TEQ     trns_dummy14, #0                ;Is the mask pixel set?
        ORRNE   trns_out_mask, trns_out_mask, #&FF000000
trnslp_setmask_size * .-trnslp_setmask_o8
        ASSERT  (.-trnslp_setmask_o1) = trnslp_setmask_size*4

; Template fragment used instead of trnslp_setmask_oN when log2 out_bpp >= 4.
; The bits to set do not come from an immediate here: they are loaded from the
; stacked trns_masking_word and ORed into trns_out_mask when the mask pixel in
; trns_dummy14 is non-zero. trns_dummy14 is overwritten in that case.
trnslp_setmask16or24
        TEQ     trns_dummy14, #0                ;Is the mask pixel set?
        LDRNE   trns_dummy14, trns_masking_word +compiled_routine_stacked
        ORRNE   trns_out_mask, trns_out_mask, trns_dummy14
trnslp_setmask16or24_size * .-trnslp_setmask16or24
        ASSERT  (.-trnslp_setmask16or24) = 3*4

;                       Cope with 1:1 mapping in the mask and screen mode
;                       Single-instruction template. It is not copied verbatim: the compiler
;                       loads this word, passes it through SetLsl with 32 - out_bpp, and
;                       stores the result, so the "xxx" shift below is only a placeholder.
;                       NOTE(review): SetLsl is defined elsewhere; presumably it rewrites the
;                       shift-amount field of the instruction - confirm.
trnslp_setmask2
        ORR     trns_out_mask, trns_out_mask, trns_dummy14, LSL #xxx
trnslp_setmask2_size    *       .-trnslp_setmask2
        ASSERT (trnslp_setmask2_size) = 1*4

;                       Read the screen word
;                       Template fragment; both instructions only act when C is set on
;                       entry. Loads the word at trns_out_ptr into trns_offset (which from
;                       here on holds the screen word, not a sprite offset) and clears any
;                       bits of trns_out_word that lie outside trns_out_mask.
trnslp_readscrn
        LDRCS   trns_offset, [trns_out_ptr]                     ; trns_offset = current screen word
        ANDCS   trns_out_word, trns_out_word, trns_out_mask     ; keep only masked-in pixel bits
trnslp_readscrn_size    *       .-trnslp_readscrn
        ASSERT (trnslp_readscrn_size) = 2*4

 [ AvoidScreenReads
; Alternative to trnslp_readscrn, compiled in when the low three action bits are
; zero. It skips the screen load when trns_out_mask is all ones. In that case
; trns_offset is left holding zero (the MVN result) instead of the screen word.
; The MVNS operand is an unshifted register, so the C flag passes through
; unchanged and the CS/HI conditions still see the caller's C.
trnslp_readscrn0
        MVNS    trns_offset, trns_out_mask      ; Z set iff out_mask is all 1s
        LDRHI   trns_offset, [trns_out_ptr]     ; load if C set & Z clear
        ANDCS   trns_out_word, trns_out_word, trns_out_mask
trnslp_readscrn0_size   *       .-trnslp_readscrn0
 ]

;                       Effect the screen word depending on the GCOL action
;                       Table of eight entries, each exactly two words, indexed by the
;                       compiler as trnslp_gcol + (action << 3). On entry to an entry
;                       trns_offset holds the screen word, trns_out_word the new pixels and
;                       trns_out_mask the mask; the result is left in trns_offset.
;                       Actions needing fewer than two instructions are padded with
;                       ANDEQ R0,R0,R0, which leaves R0 unchanged whether or not it
;                       executes. After copying an entry the compiler winds R10 back over
;                       the padding, so every real instruction must come first in its entry.
trnslp_gcol
;      GCOL 0 - plot direct
                BICCS   trns_offset, trns_offset, trns_out_mask
                EORCS   trns_offset, trns_offset, trns_out_word
;      GCOL 1 - OR with screen
                ORRCS   trns_offset, trns_offset, trns_out_word
        ANDEQ   R0,R0,R0
;      GCOL 2 - AND with screen
                EORCS   trns_out_word, trns_out_word, trns_out_mask
                BICCS   trns_offset, trns_offset, trns_out_word
;      GCOL 3 - EOR with screen
                EORCS   trns_offset, trns_offset, trns_out_word
        ANDEQ   R0,R0,R0
;      GCOL 4 - Invert screen
                EORCS   trns_offset, trns_offset, trns_out_mask
        ANDEQ   R0, R0, R0
;      GCOL 5 - Do nothing
        ANDEQ   R0,R0,R0
        ANDEQ   R0,R0,R0
;      GCOL 6 - AND with NOT colour
                BICCS   trns_offset, trns_offset, trns_out_word
        ANDEQ   R0,R0,R0
;      GCOL 7 - ORR with NOT colour
                EORCS   trns_out_word, trns_out_word, trns_out_mask
                ORRCS   trns_offset, trns_offset, trns_out_word
; Size of one table entry; the ASSERT checks the table is eight such entries
trnslp_gcol_size        *       2*4
        ASSERT (.-trnslp_gcol) = trnslp_gcol_size * 8

;                       Effect the screen word according to the mask and ECF
;                       Template fragment compiled instead of a trnslp_gcol entry when
;                       plotting the mask. All instructions only act when C is set.
;                       Computes, on the screen word held in trns_offset:
;                         trns_offset = (trns_offset OR (ecf_ora AND mask)) EOR (ecf_eor AND mask)
;                       trns_dummy12 and trns_out_word are overwritten.
trnslp_plotmask
                LDRCS   trns_dummy12, trns_comp_ecf_ora + compiled_routine_stacked      ; ECF OR word
                LDRCS   trns_out_word, trns_comp_ecf_eor + compiled_routine_stacked     ; ECF EOR word
                ANDCS   trns_dummy12, trns_dummy12, trns_out_mask       ; restrict both to the
                ANDCS   trns_out_word, trns_out_word, trns_out_mask     ;   masked-in bits
                ORRCS   trns_offset, trns_offset, trns_dummy12
                EORCS   trns_offset, trns_offset, trns_out_word
trnslp_plotmask_size    *       .-trnslp_plotmask
        ASSERT (trnslp_plotmask_size) = 6*4

;                       End of the loop - store the screen word and move on a screen pixel
;                       Template fragment. When C is set on entry the finished word in
;                       trns_offset is written to the screen (pointer post-incremented) and
;                       trns_out_word is reset to &80000000, the value trnslp_lastword tests
;                       for to decide that nothing is pending. The sprite X,Y are then
;                       stepped by their per-screen-pixel increments and the pixel count in
;                       the top 16 bits of trns_xsize is decremented. The closing CMP leaves
;                       the flags for the loop-back branch the compiler emits straight after
;                       this fragment.
trnslp_eoloop
                STRCS   trns_offset, [trns_out_ptr], #4
                MOVCS   trns_out_word, #&80000000
                ADD     trns_X, trns_X, trns_inc_X_x
                ADD     trns_Y, trns_Y, trns_inc_Y_x
                SUB     trns_xsize, trns_xsize, #&10000
                CMP     trns_xsize, #&10000
trnslp_eoloop_size      *       .-trnslp_eoloop
        ASSERT (trnslp_eoloop_size) = 6*4

;                       Finished row so store last compiled output word and mask, if necessary
;                       Template fragment. If trns_out_word still holds &80000000 (the value
;                       trnslp_eoloop resets it to after a store) there is nothing left to
;                       write, so the compiled routine returns here by popping PC from the
;                       stack. Otherwise execution falls into the code compiled after it.
trnslp_lastword
        CMP     trns_out_word, #&80000000
        LDMEQFD sp!, {PC}
trnslp_lastword_size        *       .-trnslp_lastword
        ASSERT (trnslp_lastword_size) = 2*4

;                       Stored last output word and mask on screen, so exit
;                       Template fragment, the final code of the compiled routine: writes the
;                       combined word in trns_offset to the screen (no pointer update) and
;                       returns by popping PC from the stack.
trnslp_lastword2
        STR     trns_offset, [trns_out_ptr]
        LDMFD   sp!, {PC}
trnslp_lastword2_size        *       .-trnslp_lastword2
        ASSERT (trnslp_lastword2_size) = 2*4

; Template: expand the 15-bit colour in trns_dummy14 (5 bits each of blue, green,
; red) to 8 bits per component, in place. R0 and R2 are used as scratch and are
; preserved on the stack for the duration.
; The template is held as three consecutive pieces (..321, ..322, ..323) which the
; compiler emits one after another; they only make sense as a whole.
; NOTE(review): presumably split because of a size limit in TrnsAsmReg - confirm.
; The bit diagrams assume bits 16-31 of the input are clear; bit 15 of the input is
; not masked off and, if set, ends up at bit 24 of the result.
trnslp_munge_16to321
        Push    "R0,R2"
                                          ;      fedcba9876543210 fedcba9876543210
                                          ; LR =                  0bbbbbgggggrrrrr
        MOV     R0,trns_dummy14,LSR #10   ; R0 =                            0bbbbb
        MOV     R2,R0,LSL #19             ; R2 =        0bbbbb000 0000000000000000
        AND     R0,trns_dummy14,#&3E0     ; R0 =                  000000ggggg00000
        ORR     R2,R2,R0,LSL #6           ; R2 =        0bbbbb000 ggggg00000000000
trnslp_m1632_size1        *       .-trnslp_munge_16to321
trnslp_munge_16to322
        MOV     R0,trns_dummy14,LSL #27   ; R0 = rrrrr00000000000 0000000000000000
        ORR     R2,R2,R0,LSR #24          ; R2 =        0bbbbb000 ggggg000rrrrr000

        ;now copy the top three bits of each colour component into the bottom three

        MOV     R0,#&E0                   ;avoid an LDR for speed
        ORR     R0,R0,R0,LSL #8
trnslp_m1632_size2        *       .-trnslp_munge_16to322
trnslp_munge_16to323
        ORR     R0,R0,R0,LSL #8           ; R0 = 0000000011100000 1110000011100000
        AND     R0,R0,R2                  ; R0 = 00000000bbb00000 ggg00000rrr00000
        ORR     trns_dummy14,R2,R0,LSR #5 ; LR = 00000000bbbbbbbb ggggggggrrrrrrrr
        Pull    "R0,R2"
trnslp_m1632_size3        *       .-trnslp_munge_16to323

; Template: reduce the 8-bits-per-component colour in trns_dummy14 to a 15-bit
; colour (top five bits of each of blue, green, red), in place. R0 and R2 are used
; as scratch and are preserved on the stack for the duration.
; Held as two consecutive pieces (..161, ..162) which the compiler emits one after
; the other; they only make sense as a pair.
trnslp_munge_32to161
        Push    "R0,R2"
                                             ;      fedcba9876543210 fedcba9876543210
                                             ; LR = 00000000bbbbbbbb ggggggggrrrrrrrr
        AND     R0,trns_dummy14,#&F80000     ; R0 = 00000000bbbbb000 0000000000000000
        MOV     R2,R0,LSR #9                 ; R2 =                  0bbbbb0000000000
        AND     R0,trns_dummy14,#&F800       ; R0 = 0000000000000000 ggggg00000000000
trnslp_m3216_size1        *       .-trnslp_munge_32to161
trnslp_munge_32to162
        ORR     R2,R2,R0,LSR #6              ; R2 =                  0bbbbbggggg00000
        AND     R0,trns_dummy14,#&F8         ; R0 = 0000000000000000 00000000rrrrr000
        ORR     trns_dummy14,R2,R0,LSR #3    ; LR =                  0bbbbbgggggrrrrr

        Pull    "R0,R2"
trnslp_m3216_size2        *       .-trnslp_munge_32to162


; ---------------------------------------------
; - Routine to compile the code specified by  -
; - the GCOL action, in_bpp, out_bpp          -
; - translation table, and plotmask/putsprite -
; - options                                   -
; ---------------------------------------------
; In:   R5 = flags word; the mc_gcol and mc_hasmask bits are taken from it
;       R7 = non-zero if a colour translation table is to be used
;       spritecode, save_spr_type, save_inbpp, BPP, save_inlog2bpp, Log2bpp
;       (and, in some builds, trns_palette) are read from workspace
; Out:  codebuffer holds a routine matching the options; macroword records them
; Uses: R0-R3, R10, R11, R14 are overwritten by the code visible here.
;       NOTE(review): the TrnsAsm / TrnsAsmReg / TrnsBranch / SetLsl / Debug macros
;       are defined elsewhere and may use further registers - confirm.
;
; The options are packed into one word (R11). If that equals macroword, the buffer
; already holds the right routine and nothing is compiled. Otherwise the routine is
; assembled by copying template fragments (the trnslp_ labels) to [R10], R10 being
; the write pointer. Several sequences below set the flags once and then rely on
; them across a run of conditional macro invocations.
compile_transform_code
        Push    "LR"
; R11 = GCOL value (including mask bit)
        AND     R11,R5,#mc_gcol:OR:mc_hasmask
; Set mc_ttr if colour translation table is to be used
        CMP     R7,#0
        ORRNE   R11,R11,#mc_ttr
; Set mc_plotmask if plotting the mask, clear if putting a sprite
        LDR     R14,spritecode
        AND     R14,R14,#&FF
        TEQ     R14,#SpriteReason_PlotMaskTransformed
        ORREQ   R11,R11,#mc_plotmask
; Store the sprite type, so that new masks make a difference
        LDR     R0, save_spr_type
        ORR     R11, R11, R0, LSL #mcb_sprtype

; Set in_bpp, out_bpp
        LDR     R0,save_inbpp
        ORR     R11,R11,R0,LSL #mcb_inbpp
        LDR     R1,BPP
        ORR     R11,R11,R1,LSL #mcb_outbpp

; Set bit denoting transformed sprite, rather than scaled sprite (uses same area as Sprite_PutSpriteScaled)
        ORR     R11, R11, #mc_transformed
; Test with old compiled routine
        LDR     R14,macroword
        TEQ     R11,R14
        BEQ     trns_compiling_complete
;
        STR     R11,macroword

; compile the code - from here on R0 = log2(in_bpp), R1 = log2(out_bpp), R10 = write pointer
        LDR     R0, save_inlog2bpp
        LDR     R1, Log2bpp
        ADR     R10, codebuffer
        Debug   mc,"Transformation code starts at",R10

;      Start of loop
                TrnsAsm         trnslp_strt, trnslp_strt_size
;      Get start pixel on the screen line of lefthand x coord (4 instr)
;      (one 16-byte variant per output depth, selected by log2 out_bpp)
                ADRL            R2, trnslp_getaddr_o1
                ADD             R2, R2, R1, LSL#4
                TrnsAsmReg      R2, trnslp_getaddr_size
                ASSERT trnslp_getaddr_size = 4*4
;      Shift start words (pixel and mask) according to pixel offset
                TrnsAsm         trnslp_getwam, trnslp_getwam_size
; loop Store address on stack for the loop
;      (pulled again when the loop-back branch is compiled after trnslp_eoloop)
                Push            "R10"

; If using a 1bpp mask we need to work out a different offset for the mask too
; NE below = sprite type field non-zero AND sprite has a mask; EQ otherwise
                MOVS            R2, R11, LSR #mcb_sprtype
                TSTNE           R11, #mc_hasmask

;      First 15 words of the loop (to find byte offset in sprite from sprite X,Y coords)
                TrnsAsm         trnslp_stloop1, trnslp_stloop1_size ,EQ
                TrnsAsm         trnslp_stloop2, trnslp_stloop2_size ,EQ
                TrnsAsm         trnslp_stloop3, trnslp_stloop3_size ,EQ

;      Alternative loop start for the NE case above
                TrnsAsm         trnslp_nmstloop1, trnslp_nmstloop1_size ,NE
                TrnsAsm         trnslp_nmstloop2, trnslp_nmstloop2_size ,NE
                TrnsAsm         trnslp_nmstloop3, trnslp_nmstloop3_size ,NE
                TrnsAsm         trnslp_nmstloop4, trnslp_nmstloop4_size ,NE

;      Shift pixel and mask words by output bpp (2 instructions)
;      (one 8-byte variant per output depth, selected by log2 out_bpp)
                ADRL            R2, trnslp_shfwam_o1
                ADD             R2, R2, R1, LSL#3
                TrnsAsmReg      R2, trnslp_shfwam_size
                ASSERT trnslp_shfwam_size = 2*4

;      At log2 out_bpp = 5 one extra instruction word is appended by hand
                CMP             R1, #5
                LDREQ           R2, trnslp_shfwam_o24_2
                STREQ           R2, [R10], #4      ; need three insts for 32bpp

;      If putting the sprite (i.e. not plotting the mask)
;      (when plotting the mask, everything down to trns_compile_plotmask is skipped:
;       no pixel read, colour munging, translation or pixel insertion is compiled)
        TST     R11, #mc_plotmask
        BNE     trns_compile_plotmask
;      Compile reading of pixel. Every input depth occupies a 5-instruction slot in
;      the template table (index = log2 in_bpp * 20 bytes); the slot is copied whole
;      and trnslp_chopexcess then winds R10 back over the unused tail, leaving
;      4,5,5,1,4,1 instructions for log2 in_bpp = 0..5.
                ADRL            R2, trnslp_readpx_i1
                ADD             R2, R2, R0, LSL#4
                ADD             R2, R2, R0, LSL#2
                TrnsAsmReg      R2, trnslp_readpx_size
                BL              trnslp_chopexcess

                ASSERT trnslp_readpx_size = 5*4

; if doing 16->32 and 32->16 then build in appropriate munging code
        CMP     R1,#4
        BCC     %FT07
        CMP     R0,#4
        BCC     %FT07

        CMP     R0,R1
        BEQ     %FT07

        ; input and output are both >8bpp and are different
        ; EQ (log2 in_bpp = 4) selects the 16->32 pieces, NE the 32->16 pieces; the
        ; flags from this CMP are relied on across all the invocations below
        CMP     R0,#4 ;inbpp
        ADREQ   R2,trnslp_munge_16to321
        TrnsAsmReg R2, trnslp_m1632_size1, EQ
        ADREQ   R2,trnslp_munge_16to322
        TrnsAsmReg R2, trnslp_m1632_size2, EQ
        ADREQ   R2,trnslp_munge_16to323
        TrnsAsmReg R2, trnslp_m1632_size3, EQ
        ADRNE   R2,trnslp_munge_32to161
        TrnsAsmReg R2, trnslp_m3216_size1, NE
        ADRNE   R2,trnslp_munge_32to162
        TrnsAsmReg R2, trnslp_m3216_size2, NE
07

;      If there is a translation table ...
        TST     R11, #mc_ttr
;          then translate the pixel
        BEQ     %FT01

;if inbpp and outbpp are both >8bpp ignore the translation table
        CMP     R1,#4
        BCC     %FT09
        CMP     R0,#4
        BCS     %FT08
09

;if inbpp is 16 or 32 and outbpp is 8 or below we need a CTrans
;32K entry table for colour matched. This will already have been validated

        CMP     R1,#4
        BCS     %FT07         ;output depth is >8bpp so skip this

        CMP     R0,#4
        BCC     %FT07         ;input depth is <16bpp so skip this

        ;ok - we definitely need a 32K table here. If it is 32bpp
        ;we also need to munge down to 16bpp before using the table
        ;(NE is still the result of CMP R0,#4 above, i.e. log2 in_bpp is not 4)

        ADRNE   R2,trnslp_munge_32to161
        TrnsAsmReg R2, trnslp_m3216_size1, NE
        ADRNE   R2,trnslp_munge_32to162
        TrnsAsmReg R2, trnslp_m3216_size2, NE

        ADRL     R2,trnslp_use32K
        TrnsAsmReg R2, trnslp_use32K_size
        B       %FT08
07

; First check to see if the output depth is greater than 8 bit per pixel,
; if it is then compile in the new translation functions which use
; a word array, rather than a byte array!

                [ ignore_ttr
                CMP             R1,#4
                BCC             %FT29         ;not if output bpp is <16bpp
                CMP             R0,#4
                BCS             %FT29         ;or input bpp is >8bpp

                ; a zero trns_palette word also selects the ordinary path at 29
                LDR             R2,trns_palette
                TEQ             R2,#0
                BEQ             %FT29
                TrnsAsm         trnslp_transpx2, trnslp_transpx2_size

                ;if output is 16bpp now need to munge it down as well
                ;(log2 out_bpp is 4 or 5 here, so NE means 4)
                CMP             R1,#5
                ADRNE           R2,trnslp_munge_32to161
                TrnsAsmReg      R2, trnslp_m3216_size1, NE
                ADRNE           R2,trnslp_munge_32to162
                TrnsAsmReg      R2, trnslp_m3216_size2, NE
                B               %FT28
29
                CMP             R1,#4
                TrnsAsm         trnslp_transpx,  trnslp_transpx_size,  LT
                TrnsAsm         trnslp_transpx1, trnslp_transpx1_size, GE
28
                |

                CMP             R1,#4
                TrnsAsm         trnslp_transpx,  trnslp_transpx_size,  LT
                TrnsAsm         trnslp_transpx1, trnslp_transpx1_size, GE
                ]
01
08

;      Copy pixel into output pixel word (ORR out_word, out_word, pixel,LSL#32-out_bpp)
;      The template instruction is loaded as data, given a shift of
;      32 - (1 << log2 out_bpp) via SetLsl, and stored as the next instruction.
                LDR             R2, trnslp_setpx
                MOV             R3, #1
                MOV             R3, R3, LSL R1
                RSB             R3, R3, #32
                SetLsl          R2, R3
                STR             R2, [R10], #4
                ASSERT (trnslp_setpx_size = 4)
;      Assemble mask word
;      (also the entry point when plotting the mask, which skips the pixel code above)
trns_compile_plotmask
;      If in GCOL 0-7,9,11,13, then assume the mask is whole sprite
;      The decision is a bitmap lookup: the constant below has a bit set for every
;      4-bit GCOL value that uses the mask, and shifting it right by GCOL+1 drops
;      exactly that bit into the carry flag.
        AND     R2, R11, #15            ;       R2 = GCOL in range 0-15
        ADD     R2, R2, #1              ;       R2 = GCOL +1
        MOV     R3, #2_1101010100000000 ;       Bits set are 8, 10, 12, 14, 15
        MOVS    R3, R3, LSR R2          ;       Move bit (R2-1) of R3 into the carry flag
;      CC if GCOL 0-7,9,11,13 (or no mask, in which case GCOL is in range 0-7)

;;-----------------------------------------------------------------------------
;; CC if no mask / CS has a mask so attempt to assemble in the function
;; which is correct for this depth, for 1,2,4 or 8 bit per pixel we can
;; simply compile in the ORR with a suitable mask, for 16 or 24 bit per pixel
;; we need to faff around and generate a new workspace word which
;; contains the correct mask.
;;-----------------------------------------------------------------------------

                BCS             trns_compile_mask

;      Mask not used: compile the fragment that marks the pixel as solid.
;      Below log2 out_bpp = 4 there is a table of variants 4 bytes apart; at 4 and
;      above a single common fragment is used. The CC/CS from this CMP is relied on
;      across the macro invocations.
                CMP             R1, #4                          ; is the depth sensible?
                ADRCCL          R2, trnslp_sprnomask_o1
                ADDCC           R2, R2, R1,LSL#2
                TrnsAsmReg      R2, trnslp_sprnomask_size,CC
                ADRCSL          R2, trnslp_sprnomask16or24
                TrnsAsmReg      R2, trnslp_sprnomask16or24_size,CS
                B               trns_compile_nomask

;      Sprite has mask and it is used (GCOL 8,10,12,14,15)
;      Two ways of fetching the mask pixel into trns_dummy14 are compiled, chosen by
;      the sprite type field of R11: non-zero uses the trnslp_new_sprmask pair, zero
;      uses trnslp_sprmask followed by an ordinary pixel read.

trns_compile_mask

                MOVS            R2, R11, LSR #mcb_sprtype
                BEQ             trns_old_mask

                TrnsAsm         trnslp_new_sprmask, trnslp_new_sprmask_size
                TrnsAsm         trnslp_new_sprmask2, trnslp_new_sprmask2_size

                B               trns_any_mask

trns_old_mask
                TrnsAsm         trnslp_sprmask, trnslp_sprmask_size

;      Compile reading of mask pixel. Same 5-instruction slots as the image read
;      (index = log2 in_bpp * 20 bytes), trimmed afterwards by trnslp_chopexcess.
                ADRL            R2, trnslp_readpx_i1
                ADD             R2, R2, R0, LSL#4
                ADD             R2, R2, R0, LSL#2
                TrnsAsmReg      R2, trnslp_readpx_size
                BL              trnslp_chopexcess
                ASSERT trnslp_readpx_size = 5*4

;      With equal input and output depths the mask pixel can be ORed straight into
;      the mask word (trns_compile_mask1to1) rather than being tested for non-zero.
                TEQ             R0,R1                   ;Check for 1:1 mapping?
                BEQ             trns_compile_mask1to1
trns_any_mask
;       Copy mask byte into output mask, first check for non-zero then orr in special case
;       The CC/CS from this CMP is relied on across the macro invocations.

                CMP             R1,#4
                ADRCCL          R2,trnslp_setmask_o1
                ADDCC           R2,R2,R1,LSL #3         ;Each section is 8 bytes (2 words)
                TrnsAsmReg      R2,trnslp_setmask_size,CC
                ASSERT trnslp_setmask_size = 2*4
                ADRCSL          R2,trnslp_setmask16or24
                TrnsAsmReg      R2,trnslp_setmask16or24_size,CS
                B               trns_compile_nomask

;      Copy pixel into output pixel word (ORR out_mask, out_mask, mask_pixel,LSL#32-out_bpp)
;      The template instruction is loaded as data, given a shift of
;      32 - (1 << log2 out_bpp) via SetLsl, and stored as the next instruction.


trns_compile_mask1to1
                LDR             R2, trnslp_setmask2
                MOV             R3, #1
                MOV             R3, R3, LSL R1
                RSB             R3, R3, #32
                SetLsl          R2, R3
                STR             R2, [R10], #4
                ASSERT  (trnslp_setmask2_size = 4)

; All mask variants join here: compile the screen read and then the code that
; combines the new pixels with the screen word.
trns_compile_nomask
;      Compile code to read the screen word
 [ AvoidScreenReads
;      If action code 0 (solid), use alternate code that avoids a load if the
;      mask is solid.
;      NOTE(review): the choice is made on the action bits alone, so it also
;      applies when mc_plotmask is set and trnslp_plotmask (which ORs and EORs
;      into the screen word) follows - confirm the ECF words make the screen
;      word irrelevant in that case.
                ANDS            R2, R11, #7
                TrnsAsm         trnslp_readscrn0, trnslp_readscrn0_size, EQ
                TrnsAsm         trnslp_readscrn,  trnslp_readscrn_size,  NE
 |
                TrnsAsm         trnslp_readscrn, trnslp_readscrn_size
 ]
;      Test for plotting mask - if so compile ECF code, else GCOL code
        TST     R11, #mc_plotmask
        BNE     trns_compile_ecf_store
;      Store the GCOL action code (2,1,2,1,1,0,1,2 instructions, stored as 2 instructions in input)
;      The full two-word table entry is copied, then R10 is wound back over the
;      padding: actions 0, 2 and 7 keep both words, every other action loses one,
;      and action 5 loses a second. The Z flag from the ANDS is relied on across
;      TrnsAsmReg.
                ADRL            R3, trnslp_gcol
                ANDS            R2, R11, #7
                ADD             R3, R3, R2,LSL#3
                TrnsAsmReg      R3, trnslp_gcol_size
                CMPNE           R2, #2
                CMPNE           R2, #7
                SUBNE           R10, R10, #4
                CMP             R2, #5
                SUBEQ           R10, R10, #4
                ASSERT trnslp_gcol_size = 2*4
        B       trns_compiled_gcol
;      Handle the mask and ECF
trns_compile_ecf_store
                TrnsAsm         trnslp_plotmask, trnslp_plotmask_size
;      Store the end of loop code
trns_compiled_gcol
                TrnsAsm         trnslp_eoloop, trnslp_eoloop_size
;      Close the main pixel loop: R2 = the loop-start address stacked earlier, and a
;      BGE to it is compiled (it tests the flags left by the CMP ending trnslp_eoloop)
                Pull            "R2"
                TrnsBranch      R2, BGE
;       Store the code for storing the last word
                TrnsAsm         trnslp_lastword, trnslp_lastword_size
; loop
;      Second, short loop: its start address is stacked here and pulled again just
;      below for the BCC that closes it.
                Push            "R10"
;      Shift pixel and mask words by output bpp (2 instructions)
                ADRL             R2, trnslp_shfwam_o1
                ADD             R2, R2, R1, LSL#3
                TrnsAsmReg      R2, trnslp_shfwam_size
                ASSERT trnslp_shfwam_size = 2*4
;      Loop until words are aligned
                Pull            "R2"
                TrnsBranch      R2, BCC
;      Compile code to read the screen word
;      (always the plain trnslp_readscrn here, whatever the action)
                TrnsAsm         trnslp_readscrn, trnslp_readscrn_size
;      Test for plotting mask - if so compile ECF code, else GCOL code
        TST     R11, #mc_plotmask
        BNE     trns_compile_ecf_store2
;      Store the GCOL action code (2,1,2,1,1,0,1,2 instructions, stored as 2 instructions in input)
;      Same copy-then-trim sequence as in the main loop: actions 0, 2 and 7 keep both
;      words, every other action loses one, and action 5 loses a second.
                ADRL            R3, trnslp_gcol
                ANDS            R2, R11, #7
                ADD             R3, R3, R2,LSL#3
                TrnsAsmReg      R3, trnslp_gcol_size
                CMPNE           R2, #2
                CMPNE           R2, #7
                SUBNE           R10, R10, #4
                CMP             R2, #5
                SUBEQ           R10, R10, #4
                ASSERT trnslp_gcol_size = 2*4
        B       trns_compiled_gcol2
;      Handle the mask and ECF
trns_compile_ecf_store2
                TrnsAsm         trnslp_plotmask, trnslp_plotmask_size
;      Compile code to store the word and exit
trns_compiled_gcol2
                TrnsAsm         trnslp_lastword2, trnslp_lastword2_size
;      Code compiled.
        Debug   mc,"Transformation code ends at",r10
;      The routine has just been written as data, so tell the OS to synchronise
;      the instruction side before it is executed. Ranged form of the call:
;      R0 bit 0 set, R1 = low address, R2 = high address (inclusive).
;      R10 points at the next free word, so the last word written is at R10-4.
;      The start address is formed directly from codebuffer (where R10 was
;      initialised at the start of compilation) rather than loaded from the
;      SP-relative trns_codebuffer slot: that symbol is an offset from the frame
;      base, and LR plus R0-R2 have been pushed here with no compensation, so the
;      load would fetch some other word of the stack.
	Push	"r0-r2"
	MOV	r0, #1			; range given in r1, r2
	ADR	r1, codebuffer		; first word of the compiled routine
	SUB	r2, r10, #4		; last word of the compiled routine (inclusive)
	SWI	XOS_SynchroniseCodeAreas
	Pull	"r0-r2"

;      Also reached directly when macroword shows the buffer is already up to date
trns_compiling_complete
        Pull    "PC"

trnslp_chopexcess
; Called straight after a five-word trnslp_readpx_i slot has been copied to the
; code buffer: winds the write pointer back over the words of the slot that the
; selected depth does not use.
; In:   R0  = log2 of the input bpp (selects which slot was copied)
;       R10 = write pointer, just past the five words copied
; Out:  R10 reduced by one word for R0 = 0 or 4, by four words for R0 = 3 or 5,
;       unchanged for R0 = 1 or 2
;       Flags are those of the final TEQ (R0 against 5)
; The four cases are mutually exclusive, so their order only matters for the
; exit flags; the test against 5 is deliberately kept last.

        ; slots with one unused word
        TEQ             R0, #4
        SUBEQ           R10, R10, #1*4
        TEQ             R0, #0
        SUBEQ           R10, R10, #1*4

        ; slots with four unused words
        TEQ             R0, #3
        SUBEQ           R10, R10, #4*4
        TEQ             R0, #5
        SUBEQ           R10, R10, #4*4

        MOV             PC, LR


; NOTE(review): this routine itself only stacks R14 (4 bytes); the remaining 12
; bytes presumably belong to its caller - confirm.
calc_row_stacked        *       4*4    ;       Number of bytes stacked in plot_and_calculate_row

; Plot one screen row between two x coordinates by calling the compiled routine.
; In:   trns_scr_lx, trns_scr_rx = the two screen x limits of the row (either order)
;       trns_X_x0_y, trns_Y_x0_y, trns_inc_X_x, trns_inc_Y_x = sprite position and
;       per-screen-pixel increments used in the two MLAs below
;       Stack frame as laid out at the top of the file, calc_row_stacked bytes
;       above SP once R14 has been pushed
; Out:  Returns straight to the caller if the row is empty after clipping to
;       gwx0..gwx1. Otherwise jumps to the compiled routine with R14 restored.
;       NOTE(review): the compiled routine is expected to return to our caller
;       itself (its templates exit by popping PC) - confirm against trnslp_strt.
plot_and_calculate_row
        Push    "R14"
;                                       Order the limits, then clip them to the graphics window
        CMP     trns_scr_lx, trns_scr_rx
        Swap    trns_scr_lx, trns_scr_rx, GT
        LDR     trns_dummy14, gwx0
        CMP     trns_scr_lx, trns_dummy14
        MOVLT   trns_scr_lx, trns_dummy14
        LDR     trns_dummy14, gwx1
        CMP     trns_scr_rx, trns_dummy14
        MOVGT   trns_scr_rx, trns_dummy14
;                                       Nothing left to plot if left >= right
        CMP     trns_scr_lx, trns_scr_rx
        Pull    "PC",GE
;                                       Get sprite offsets for left of current y line
;                                       First get dummy11 = scr_lx - x(0)
        LDR     trns_dummy11, trns_spr_xcoords + calc_row_stacked
;        Debug tr,"X,Y(x0,y0), incXx, incYy",trns_X_x0_y,trns_Y_x0_y,trns_inc_X_x,trns_inc_Y_x
;        Debug tr,"coords:",trns_scr_lx, trns_scr_rx,trns_scr_y,trns_dummy11
;                                       NOTE(review): the ASR #8 suggests the stored x coordinate
;                                       carries 8 fractional bits - confirm
        MOV     trns_dummy11, trns_dummy11, ASR#8
        SUB     trns_dummy11, trns_scr_lx, trns_dummy11
; X = ((screen_lx%-x(0)) * (inc_X_x)) + X_x0_y
        MLA     trns_X, trns_dummy11, trns_inc_X_x, trns_X_x0_y
; Y = ((screen_lx%-x(0)) * (spr_inc_Y_x)) + Y_x0_y
        MLA     trns_Y, trns_inc_Y_x, trns_dummy11, trns_Y_x0_y
;                                       Get the x posn and x size for the plot
        SUB     trns_dummy14, trns_scr_rx, trns_scr_lx
        MOV     trns_out_x, trns_scr_lx
;                                       Load the compile routine registers - HERE
;                                       (four consecutive frame words starting at trns_comp_spr_left;
;                                        trns_spr_start is both the base and one of the loaded registers)
        ADR     trns_spr_start, trns_comp_spr_left + calc_row_stacked
        LDMIA   trns_spr_start, {trns_xsize, trns_spr_start, trns_byte_width, trns_spr_height}
;                                       Put the row width (right - left) into the top 16 bits of trns_xsize
        ORR     trns_xsize, trns_xsize, trns_dummy14,LSL#16
;                                       Plot the row (call compiled code)
;                                       R14 has been pulled, so the frame is 4 bytes closer: hence the -4
        Pull    "R14"
        LDR     PC, trns_codebuffer + calc_row_stacked -4

        END

        END