; Copyright 2000 Pace Micro Technology plc
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
        GBLL    MinorL2PThack
MinorL2PThack SETL {TRUE}

; Fixed page allocation is as follows

                        ^       0
DRAMOffset_FirstFixed   #       0
DRAMOffset_ScratchSpace #       16*1024
DRAMOffset_PageZero     #       16*1024
DRAMOffset_PageTables   #       0               ; Base of the page tables used during init; just a useful symbol we can use for relative offsets

 [ LongDesc
                        ^       DRAMOffset_PageTables
DRAMOffset_LL2PT        #       16*1024         ; Each page needs to be page-aligned
DRAMOffset_LL1PT        #       4096            ; Only needs to be 32 byte aligned
DRAMOffset_LastFixed_LongDesc # 0
 ]

 [ ShortDesc
                        ^       DRAMOffset_PageTables
DRAMOffset_L1PT         #       16*1024         ; L1PT must be 16K-aligned
DRAMOffset_LastFixed_ShortDesc # 0
 ]

;        IMPORT  Init_ARMarch
;        IMPORT  ARM_Analyse

 [ MEMM_Type = "VMSAv6"
mmuc_init_new
        ; MMUC initialisation flags for ARMv6/ARMv7
        ; This tries to leave the reserved/unpredictable bits untouched, while initialising everything else to what we want
                ; ARMv7MP (probably) wants SW. ARMv6 wants U+XP (which should both be fixed at 1 on ARMv7)
        DCD     MMUC_BEN+MMUC_SW+MMUC_U+MMUC_XP
                ; M+C+W+Z+I+L2 clear to keep MMU/caches off.
                ; A to keep alignment exceptions off (for now at least)
                ; B+EE clear for little-endian
                ; S+R+RR clear to match mmuc_init_old
                ; V+VE clear to keep processor vectors at &0
                ; FI clear to disable fast FIQs (interruptible LDM/STM)
                ; TRE+AFE clear for our VMSAv6 implementation
                ; TE clear for processor vectors to run in ARM mode
        DCD     MMUC_M+MMUC_A+MMUC_C+MMUC_W+MMUC_B+MMUC_S+MMUC_R+MMUC_Z+MMUC_I+MMUC_V+MMUC_RR+MMUC_FI+MMUC_VE+MMUC_EE+MMUC_L2+MMUC_TRE+MMUC_AFE+MMUC_TE
mmuc_init_old
        ; MMUC initialisation flags for ARMv5 and below, as per ARM600 MMU code
                ; Late abort (ARM6 only), 32-bit Data and Program space. No Write buffer (ARM920T
                ; spec says W bit should be set, but I reckon they're bluffing).
                ;
                ; The F bit's tricky. (1 => CPCLK=FCLK, 0=>CPCLK=FCLK/2). The only chip using it was the
                ; ARM700, it never really reached the customer, and it's always been programmed with
                ; CPCLK=FCLK. Therefore we'll keep it that way, and ignore the layering violation.
        DCD     MMUC_F+MMUC_L+MMUC_D+MMUC_P
                ; All of these bits should be off already, but just in case...
        DCD     MMUC_B+MMUC_W+MMUC_C+MMUC_A+MMUC_M+MMUC_RR+MMUC_V+MMUC_I+MMUC_Z+MMUC_R+MMUC_S
 ]

; void RISCOS_InitARM(unsigned int flags)
;
RISCOS_InitARM
        MOV     a4, lr
        ; Check if we're architecture 3. If so, don't read the control register.
        BL      Init_ARMarch
        MOVEQ   a3, #0
        ARM_read_control a3, NE
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMv6
        CMPNE   a1, #ARMvF
        ADREQ   a2, mmuc_init_new
        ADRNE   a2, mmuc_init_old
        LDMIA   a2, {a2, lr}
        ORR     a3, a3, a2
        BIC     a3, a3, lr
 |
        ; Late abort (ARM6 only), 32-bit Data and Program space. No Write buffer (ARM920T
        ; spec says W bit should be set, but I reckon they're bluffing).
        ;
        ; The F bit's tricky. (1 => CPCLK=FCLK, 0=>CPCLK=FCLK/2). The only chip using it was the
        ; ARM700, it never really reached the customer, and it's always been programmed with
        ; CPCLK=FCLK. Therefore we'll keep it that way, and ignore the layering violation.
        ORR     a3, a3, #MMUC_F+MMUC_L+MMUC_D+MMUC_P
        ; All of these bits should be off already, but just in case...
        BIC     a3, a3, #MMUC_B+MMUC_W+MMUC_C+MMUC_A+MMUC_M
        BIC     a3, a3, #MMUC_RR+MMUC_V+MMUC_I+MMUC_Z+MMUC_R+MMUC_S
 ]

        ; Off we go.
        ARM_write_control a3
        MOV     a2, #0
 [ MEMM_Type = "VMSAv6"
        myISB   ,a2,,y ; Ensure the update went through
 ]

        ; In case it wasn't a hard reset
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMvF
        ; Assume that all ARMvF ARMs have multi-level caches and thus no single MCR op for invalidating all the caches
        ADREQ   lr, %FT01
        BEQ     HAL_InvalidateCache_ARMvF
        MCRNE   ARM_config_cp,0,a2,ARMv4_cache_reg,C7           ; invalidate I+D caches
01
 ]
        CMP     a1, #ARMv3
        BNE     %FT01
        MCREQ   ARM_config_cp,0,a2,ARMv3_TLBflush_reg,C0        ; flush TLBs
        B       %FT02
01      MCRNE   ARM_config_cp,0,a2,ARMv4_TLB_reg,C7             ; flush TLBs
02
 [ MEMM_Type = "VMSAv6"
        myDSB   ,a2,,y
        myISB   ,a2,,y
 ]

        ; We assume that ARMs with an I cache can have it enabled while the MMU is off.
        [ :LNOT:CacheOff
        ORRNE   a3, a3, #MMUC_I
        ARM_write_control a3, NE                                ; whoosh
 [ MEMM_Type = "VMSAv6"
        myISB   ,a2,,y ; Ensure the update went through
 ]
        ]

        ; Check if we are in a 26-bit mode.
        MRS     a2, CPSR
        ; Keep a soft copy of the CR in a banked register (R13_und)
        MSR     CPSR_c, #F32_bit+I32_bit+UND32_mode
        MOV     sp, a3
        ; Switch into SVC32 mode (we may have been in SVC26 before).
        MSR     CPSR_c, #F32_bit+I32_bit+SVC32_mode

        ; If we were in a 26-bit mode, the lr value given to us would have had PSR flags in.
        TST     a2, #2_11100
        MOVNE   pc, a4
        BICEQ   pc, a4, #ARM_CC_Mask

; void *RISCOS_AddRAM(unsigned int flags, uintptr_t start, uintptr_t end, uintptr_t sigbits, void *ref)
;   Entry:
;     flags   bit 0: video memory (currently only one block permitted)
;             bit 1: video memory is not suitable for general use
;             bit 2: memory can't be used for DMA (sound, video, or other)
;             bits 8-11: speed indicator (arbitrary, higher => faster)
;             bit 12: start, end, sigbits are shifted right 12 bits (for supporting large physical addresses)
;             other bits reserved (SBZ)
;     start   = start address of RAM (inclusive) (no alignment requirements)
;     end     = end address of RAM (exclusive) (no alignment requirements, but must be >= start)
;     sigbits = significant address bit mask (1 => this bit of addr decoded, 0 => this bit ignored)
;     ref     = reference handle (NULL for first call)
;
; The first registered block must be in the low 4GB, and blocks must not cross
; 4GB thresholds.
; The (unshifted) sigbits is assumed to be the same across all calls.

; A table is built up at the head of the first block of memory.
; The table consists of (addr, len, flags) pairs, terminated by a count of
; those pairs; ref points to that counter.
; The table stores addresses are in terms of 4K pages, allowing us to
; theoretically support a 44 bit physical space.
; Twelve bits of flags are stored at the bottom of the length word.
; The remaining 20 length bits are the length of the block in 4K pages, minus
; one (so a full 4GB region will have length &FFFFF)

        ROUT
RISCOS_AddRAM
        Entry   "v1-v4"
        LDR     v4, [sp, #5*4]          ; Get ref

        ; Round to pages. If we were extra sneaky we could not do this and chuck out incomplete
        ; pages after concatanation, but it would be a weird HAL that gave us pages split across
        ; calls.
        ;
        TST     a1, #OSAddRAM_LargeAddresses
        ADDEQ   a2, a2, #4096           ; round start address up
        SUBEQ   a2, a2, #1
        MOVEQ   a2, a2, LSR #12
        MOVEQ   a3, a3, LSR #12         ; round end address down
        MOVEQ   a4, a4, LSR #12
        ORREQ   a4, a4, #&FF000000
        ORREQ   a4, a4, #&00F00000
        BIC     a1, a1, #OSAddRAM_LargeAddresses ; Flag no longer relevant

      [ :LNOT: LongDesc
        ; Ignore any RAM above the 32bit barrier if we've been built with
        ; long-descriptor support disabled (HAL may not know our build settings)
        CMP     a2, #1:SHL:20
        BHS     %FT90
      ]

        CMP     a3, a2
        BLS     %FT90                   ; check we aren't now null

        CMP     v4, #0
        BEQ     %FT20


        ; We are not dealing with the first block since v4 != 0.  Make an attempt to merge this block
        ; with the previous block.
        LDMDB   v4, {v1, v2}            ; Get details of the previous block
        EOR     ip, v1, a2
        CMP     ip, #1:SHL:20
        BHS     %FT20                   ; Don't merge if they're in different 4GB blocks
        MOV     v3, v2, LSL #20         ; Isolate flags
        MOV     v2, v2, LSR #12         ; And get length in pages
        ADD     v2, v1, v2              ; Get the end address
        ADD     v2, v2, #1
        EOR     v2, v2, a2              ; Compare with the current block start address...
        TST     v2, a4                  ; ... but only check the decoded bits.
        EOR     v2, v2, a2              ; Restore the previous block end address.
        TEQEQ   v3, a1, LSL #20         ; And are the page flags the same?
        BNE     %FT10                   ; We can't merge it after the previous block

        ; v1 = previous start
        ; v2 = previous end
        ; The block is just after the previous block.  That means the start address is unchanged, but
        ; the length is increased.
        LDR     v2, [v4, #-4]           ; Reload length+flags
        SUB     a3, a3, a2              ; Find the length of the new block.
        ; a3 = length of block
        ADD     v2, v2, a3, LSL #12     ; Add it to the previous length.
        STR     v2, [v4, #-4]           ; Update the block size in memory.
        MOV     a1,v4
        EXIT

        ; The block is not just after the previous block, but it may be just before.  This may be the
        ; case if we are softloaded.
10      SUB     v1, v1, #1              ; Compare the address before the previous block start ...
        SUB     a3, a3, #1              ; ... with the address of the last page in this block ...
        EOR     v1, v1, a3
        TST     v1, a4                  ; ... but check only the decoded bits.
        ADD     a3, a3, #1              ; Restore the end address.
        TEQEQ   v3, a1, LSL #20         ; And are the page flags the same?
        BNE     %FT20                   ; Skip if we cannot merge the block.

        ; The block is just before the previous block.  The start address and length both change.
        LDMDB   v4, {v1, v2}            ; Get the previous block again
        SUB     a3, a3, a2              ; Calculate the current block size.
        SUB     v1, v1, a3              ; Subtract from the previous block start address.
        ADD     v2, v2, a3, LSL #12     ; Calculate the new length+flags
        STMDB   v4, {v1, v2}            ; Update the block info in memory.
        MOV     a1,v4
        EXIT

        ; We now have a region which does not merge with a previous region.  We move it up to the
        ; highest address we can in the hope that this block will merge with the next block.
20      SUB     a3, a3, a2              ; Calculate the block size
        SUB     a3, a3, #1
        ORR     a3, a3, a1, LSL #20     ; Put the flags in
        MOV     a3, a3, ROR #20         ; And rotate so they're at the bottom
        MVN     v1, a4                  ; Get the non-decoded address lines.
        ORR     a2, v1, a2              ; Set the non-decoded address bit in the start address.

30      CMP     v4, #0                  ; If the workspace has not been allocated...
        MOVEQ   v4, a2, LSL #12         ; ... use this block.
        MOVEQ   v1, #0                  ; Initialise the counter.

        ; The block/fragment to be added is between a2 and a2+a3.
        LDRNE   v1, [v4]                ; Get the old counter if there was one.
        STMIA   v4!, {a2, a3}           ; Store address and size.
        ADD     v1, v1, #1              ; Increment the counter.
        STR     v1, [v4]                ; Store the counter.

90      MOV     a1,v4
        EXIT                            ; We've done with this block now.



; Subtractv1v2fromRAMtable
;
; On entry: v1 = base of memory area
;           v2 = size of memory area
;           a4 = RAM table handle (ie pointer to terminator word containing number of entries)
;
; On exit:  a1-a3 preserved
;           a4 and RAM table updated
;           other registers corrupted
Subtractv1v2fromRAMtable
        ADD     v2, v1, v2              ; v2 = end address
        MOV     v1, v1, LSR #12         ; round base down
        ADD     v2, v2, #4096
        SUB     v2, v2, #1
        MOV     v2, v2, LSR #12         ; round end up

        LDR     v5, [a4]
        SUB     v8, a4, v5, LSL #3
10      TEQ     v8, a4
        MOVEQ   pc, lr
        LDMIA   v8!, {v3, v4}
        ADD     v6, v3, v4, LSR #12
        ADD     v6, v6, #1              ; v6 = end of RAM block
        CMP     v2, v3                  ; if our end <= RAM block start
        CMPHI   v6, v1                  ; or RAM block end <= our start
        BLS     %BT10                   ; then no intersection

        MOV     v4, v4, LSL #20         ; extract flags

        CMP     v1, v3
        BHI     not_bottom

        ; our area is at the bottom
        CMP     v2, v6
        BHS     remove_block

        SUB     v6, v6, v2              ; v6 = new size
        SUB     v6, v6, #1
        MOV     v6, v6, LSL #12
        ORR     v6, v6, v4, LSR #20     ; + flags
        STMDB   v8, {v2, v6}            ; store new base (= our end) and size
        B       %BT10

        ; we've completely covered a block. Remove it.
remove_block
        MOV     v7, v8
20      TEQ     v7, a4                  ; shuffle down subsequent blocks in table
        LDMNEIA v7, {v3, v4}
        STMNEDB v7, {v3, v4}
        ADDNE   v7, v7, #8
        BNE     %BT20
        SUB     v5, v5, #1
        SUB     a4, a4, #8
        STR     v5, [a4]
        SUB     v8, v8, #8
        B       %BT10

        ; our area is not at the bottom.
not_bottom
        CMP     v2, v6
        BLO     split_block

        ; our area is at the top
        SUB     v6, v1, v3              ; v6 = new size
        SUB     v6, v6, #1
        MOV     v6, v6, LSL #12
        ORR     v6, v6, v4, LSR #20     ; + flags
        STMDB   v8, {v3, v6}            ; store original base and new size
        B       %BT10

split_block
        MOV     v7, a4
30      TEQ     v7, v8                  ; shuffle up subsequent blocks in table
        LDMDB   v7, {v3, v4}
        STMNEIA v7, {v3, v4}
        SUBNE   v7, v7, #8
        BNE     %BT30
        ADD     v5, v5, #1
        ADD     a4, a4, #8
        STR     v5, [a4]

        MOV     v4, v4, LSL #20         ; (re)extract flags

        SUB     v7, v1, v3              ; v7 = size of first half
        SUB     v6, v6, v2              ; v6 = size of second half
        SUB     v7, v7, #1
        SUB     v6, v6, #1
        MOV     v7, v7, LSL #12
        MOV     v6, v6, LSL #12
        ORR     v7, v7, v4, LSR #20
        ORR     v6, v6, v4, LSR #20     ; + flags
        STMDB   v8, {v3, v7}
        STMIA   v8!, {v2, v6}
        B       %BT10


;void RISCOS_Start(unsigned int flags, int *riscos_header, int *hal_header, void *ref)
;

; We don't return, so no need to obey ATPCS, except for parameter passing.
; register usage:   v4 = location of VRAM
;                   v6 = amount of VRAM

        ROUT
RISCOS_Start
        TEQ     a4, #0
01      BEQ     %BT01                           ; Stop here if no RAM

        ; subtract the HAL and OS from the list of RAM areas
        MOV     v1, a2
        LDR     v2, [a2, #OSHdr_ImageSize]
        BL      Subtractv1v2fromRAMtable
        LDR     v1, [a3, #HALDesc_Start]
        ADD     v1, a3, v1
        LDR     v2, [a3, #HALDesc_Size]
        BL      Subtractv1v2fromRAMtable

        LDR     v5, [a4]                        ; v5 = the number of RAM blocks
        SUB     v8, a4, v5, LSL #3              ; Jump back to the start of the list.

 [ LongDesc :LAND: ShortDesc
        ; Check and see if there's any RAM above the 4GB limit
        MOV     v1, v8
02
        LDR     v2, [v1], #8
        CMP     v2, #1:SHL:20
        BHS     RISCOS_Start_LongDesc
        CMP     v1, a4
        BNE     %BT02
        B       RISCOS_Start_ShortDesc
 ]

        ; MMU initialisation is tricky, so just duplicate all of this code
        ; if we need to support multiple page table formats
        GBLA    ptcounter
ptcounter SETA    0
        WHILE   ptcounter < 2
      [ ptcounter = 0 :LAND: ShortDesc
pt      SETS    "ShortDesc"
      ELIF ptcounter = 1 :LAND: LongDesc
pt      SETS    "LongDesc"
      |
pt      SETS    ""
      ]
      [ "$pt" <> ""
RISCOS_Start_$pt
        ROUT

        ; Search for some VRAM
05      LDMIA   v8!, {v1, v2}                   ; Get a block from the list. (v1,v2)=(addr,size+flags)
        TST     v2, #OSAddRAM_IsVRAM            ; Is it VRAM?
        BNE     %FT20                           ; If so, deal with it below
        TEQ     v8, a4                          ; Carry on until end of list or we find some.
        BNE     %BT05

        ; Extract some pseudo-VRAM from first DMA-capable RAM block
        SUB     v8, a4, v5, LSL #3              ; Rewind again.
06      LDMIA   v8!, {v1, v2}
        TEQ     v8, a4                          ; End of list?
        TSTNE   v2, #OSAddRAM_NoDMA             ; DMA capable?
        BNE     %BT06
        MOV     v2, v2, LSR #12                 ; Remove flags
        ADD     v2, v2, #1
        ; Is this the only DMA-capable block?
        MOV     v4, v8
        MOV     v6, #OSAddRAM_NoDMA
07      TEQ     v4, a4
        BEQ     %FT08
        LDR     v6, [v4, #4]
        ADD     v4, v4, #8
        TST     v6, #OSAddRAM_NoDMA
        BNE     %BT07
08
        ; v6 has NoDMA set if v8 was the only block
        TST     v6, #OSAddRAM_NoDMA
        MOV     v4, v1                          ; Allocate block as video memory
        MOV     v6, v2
        BEQ     %FT09
        SUBS    v6, v6, #(16*1024*1024):SHR:12  ; Leave 16M if it was the only DMA-capable block
        MOVLS   v6, v2, LSR #1                  ; If that overflowed, take half the bank.
09
        CMP     v6, #(32*1024*1024):SHR:12
        MOVHS   v6, #(32*1024*1024):SHR:12      ; Limit allocation to 32M (arbitrary)

        ADD     v1, v1, v6                      ; Adjust the RAM block base...
        SUBS    v2, v2, v6                      ; ... and the size
        LDREQ   v6, [v8, #-4]
        BEQ     %FT21                           ; pack array tighter if this block is all gone
        STR     v1, [v8, #-8]                   ; update base
        LDR     v1, [v8, #-4]
        MOV     v1, v1, LSL #20                 ; original flags
        ORR     v6, v6, v1                      ; merged into VRAM size
        ORR     v1, v1, v2                      ; and into new size
        MOV     v1, v1, ROR #20
        MOV     v6, v6, ROR #20
        SUB     v1, v1, #4096
        STR     v1, [v8, #-4]                   ; update size
        B       %FT30

        ; Note real VRAM parameters
20      MOV     v6, v2                          ; Remember the size+flags and
        MOV     v4, v1                          ; address of the VRAM
21      ADD     v6, v6, #4096                   ; Assume <4GB of VRAM!
22      TEQ     v8, a4                          ; if not at the end of the array
        LDMNEIA v8, {v1, v2}                    ; pack the array tighter
        STMNEDB v8, {v1, v2}
        ADDNE   v8, v8, #8
        BNE     %BT22
25      SUB     v5, v5, #1                      ; decrease the counter
        STR     v5, [a4, #-8]!                  ; and move the end marker down

30      SUB     v8, a4, v5, LSL #3              ; Rewind to start of list

        ; Scan forwards to find the fastest block of non-DMAable memory which is at least DRAMOffset_LastFixed size, at least 16KB aligned, and located in the first 4GB
        LDMIA   v8!, {v1, v2}
31
        TEQ     v8, a4
        MOVEQ   v1, v1, LSL #12
        BEQ     %FT32
        LDMIA   v8!, {v7, ip}
        MOV     sp, v7, ROR #2                  ; Bottom two bits (for 16KB alignment) rotated high
        CMP     sp, #1:SHL:18                   ; Ignore if not aligned, or base address >= 4GB
        BHS     %BT31
        CMP     ip, #DRAMOffset_LastFixed_$pt-4096
        ANDHS   sp, ip, #&F*OSAddRAM_Speed+OSAddRAM_NoDMA
        ANDHS   lr, v2, #&F*OSAddRAM_Speed+OSAddRAM_NoDMA
        ASSERT  OSAddRAM_Speed = 1:SHL:8
        ASSERT  OSAddRAM_NoDMA < OSAddRAM_Speed
        MOVHS   sp, sp, ROR #8                  ; Give NoDMA flag priority over speed when sorting
        CMPHS   sp, lr, ROR #8
        MOVHI   v1, v7
        MOVHI   v2, ip
        B       %BT31
32

        ; Fill in the Kernel's permanent memory table, sorting by address, speed and DMA ability.
        ; * Address: All memory that falls in the low 4GB of the physical map
        ;   comes first. This makes it easier for our initial memory allocation
        ;   (no danger of allocating pages which can't be accessed with the MMU
        ;   off), but may also help with wider software compatibility (all low-
        ;   RAM pages occupy the lowest physical page numbers)
        ; * Non-DMAable RAM is preferred over DMAable, as the kernel requires
        ;   very little DMAable RAM, and we don't want to permanently claim
        ;   DMAable RAM if we're not actually using it for DMA (in case machine
        ;   only has a tiny amount available)
        ; * Speed: Fastest RAM is listed first, so that we'll prefer to allocate
        ;   it for these important kernel/system areas
        ADD     ip, v1, #DRAMOffset_PageZero
        ASSERT  DRAMOffset_PageZero > 0         ; If the workspace block is the block containing the OS_AddRAM list, make sure the two don't overlap otherwise we might corrupt it while we copy it

        ADD     sp, v1, #DRAMOffset_ScratchSpace + ScratchSpaceSize

        Push    "a1,a2,a3"                      ; Remember our arguments

        SUB     v8, a4, v5, LSL #3              ; Rewind to start of list
        CMP     v5, #DRAMPhysTableSize          ; Don't overflow our table
        ADDHI   a4, v8, #DRAMPhysTableSize*8 - 8

        ; First put the VRAM information in to free up some regs
        ADD     v7, ip, #VideoPhysAddr
        STMIA   v7!, {v4, v6}

        ; Now fill in the rest
        ASSERT  DRAMPhysAddrA = VideoPhysAddr+8
        MOV     v1, v1, LSR #12
        ADDS    v2, v2, #4096                   ; Store true length
        ADDCS   v2, v2, #1:SHL:31               ; If it overflowed, must have been 4GB block, so clamp at 2GB (loop below will add the second 2GB)
        STMIA   v7!, {v1, v2}                   ; workspace block must be first
33
        TEQ     v8, a4
        BEQ     %FT39
        LDMIA   v8!, {v1, v2}
        ADDS    v2, v2, #4096                   ; Get true length
        ADDCS   v2, v2, #1:SHL:31               ; If it overflowed, must have been 4GB block, so split into two 2GB blocks
        SUBCS   v2, v2, #4096
        ADDCS   v1, v1, #1:SHL:(31-12)
        STMCSDB v8!, {v1, v2}
        ADDCS   v2, v2, #4096
        SUBCS   v1, v1, #1:SHL:(31-12)
        ADD     a1, ip, #DRAMPhysAddrA
        LDMIA   a1!, {a2, a3}
        TEQ     v1, a2
        BEQ     %BT33                           ; don't duplicate the initial block
        ; Perform insertion sort
        ; a1-a3, v3-v6, ip, lr free
        AND     v3, v2, #&F*OSAddRAM_Speed
        CMP     v1, #1:SHL:20
        ORRLO   v3, v3, #1:SHL:31               ; Low RAM takes priority
        TST     v2, #OSAddRAM_NoDMA
        ORRNE   v3, v3, #1:SHL:30               ; Followed by non-DMA
34
        AND     v4, a3, #&F*OSAddRAM_Speed
        CMP     a2, #1:SHL:20
        ORRLO   v4, v4, #1:SHL:31               ; Low RAM takes priority
        TST     a3, #OSAddRAM_NoDMA
        ORRNE   v4, v4, #1:SHL:30               ; Followed by non-DMA
        CMP     v3, v4                          ; Compare priority value
        BHI     %FT35
        TEQ     a1, v7
        LDMNEIA a1!, {a2, a3}
        BNE     %BT34
        ADD     a1, a1, #8
35
        ADD     v7, v7, #8
        ; Insert at a1-8, overwriting {a2, a3}
36
        STMDB   a1, {v1, v2}                   ; store new entry
        TEQ     a1, v7
        MOVNE   v1, a2                         ; if not at end, shuffle
        MOVNE   v2, a3                         ; overwritten entry down one,
        LDMNEIA a1!, {a2, a3}                  ; load next to be overwritten,
        BNE     %BT36                          ; and loop
        B       %BT33

39
        ; Now we have to work out the total RAM size
        MOV     a2, #0
        ADD     v6, ip, #PhysRamTable
        MOV     a3, v6
40
        LDMIA   v6!, {v1, v2}                   ; get address, size
        ADD     a2, a2, v2, LSR #12             ; add on size
        TEQ     v6, v7
        BNE     %BT40

        ; Work out how much DMAable RAM the HAL/kernel needs
        LDR     a1, [sp, #8]
        LDR     a1, [a1, #HALDesc_Flags]
        TST     a1, #HALFlag_NCNBWorkspace              ; do they want uncacheable workspace?
        LDRNE   a1, =SoundDMABuffers-CursorChunkAddress + ?SoundDMABuffers + 32*1024 + DRAMOffset_LastFixed_$pt
        LDREQ   a1, =SoundDMABuffers-CursorChunkAddress + ?SoundDMABuffers + DRAMOffset_LastFixed_$pt
        ; Scan PhysRamTable for a DMAable block of at least this size, extract it, and stash it in InitDMABlock
        ; Once the initial memory claiming is done we can re-insert it
        ADD     a4, a3, #DRAMPhysAddrA-VideoPhysAddr    ; don't claim VRAM

        ; First block needs special treatment as we've already claimed some of it
        LDMIA   a4!, {v1, v2}
        TST     v2, #OSAddRAM_NoDMA
        BNE     %FT41
        CMP     v2, a1
        BLO     %FT41
        ; Oh crumbs, the first block is a match for our DMA block
        ; Claim it as normal, but set InitDMAEnd to v1+DRAMOffset_LastFixed so
        ; that the already used bit won't get used for DMA
        ; We also need to be careful later on when picking the initial v2 value
        MOV     lr, v1, LSL #12
        ADD     lr, lr, #DRAMOffset_LastFixed_$pt
        STR     lr, [ip, #InitDMAEnd]
        B       %FT43
41
        ; Go on to check the rest of PhysRamTable
        SUB     a1, a1, #DRAMOffset_LastFixed_$pt
42
        LDMIA   a4!, {v1, v2}
        TST     v2, #OSAddRAM_NoDMA
        BNE     %BT42
        CMP     v2, a1
        BLO     %BT42
        CMP     v1, #1:SHL:20 ; <4GB only for now
        BHS     %BT42
        ; Make a note of this block
        MOV     lr, v1, LSL #12
        STR     lr, [ip, #InitDMAEnd]
43
        STR     v1, [ip, #InitDMABlock]
        STR     v2, [ip, #InitDMABlock+4]
        SUB     lr, a4, a3
        STR     lr, [ip, #InitDMAOffset]
        ; Now shrink/remove this memory from PhysRamTable
        SUB     v2, v2, a1
        ADD     v1, v1, a1, LSR #12
        CMP     v2, #4096               ; Block all gone?
        STMHSDB a4, {v1, v2}            ; no, just shrink it
        BHS     %FT55
45
        CMP     a4, v7
        LDMNEIA a4, {v1, v2}
        STMNEDB a4, {v1, v2}
        ADDNE   a4, a4, #8
        BNE     %BT45
        SUB     v7, v7, #8

; a2 = Total memory size (pages)
; a3 = PhysRamTable
; v7 = After last used entry in PhysRamTable
; ip -> ZeroPage

; now store zeros to fill out table

55
        ADD     v2, a3, #PhysRamTableEnd-PhysRamTable
        MOV     v3, #0
        MOV     v4, #0
57
        CMP     v7, v2
        STMLOIA v7!, {v3, v4}
        BLO     %BT57

; Calculate PhysIllegalMask before anything tries to use it
        BL      DeterminePhysIllegalMask

; Time to set up the L1PT. Just zero it out for now.

 [ pt = "LongDesc"
        ASSERT  DRAMOffset_LL1PT = DRAMOffset_LL2PT+16*1024
        LDR     a4, =DRAMOffset_LL1PT+4096-(PhysRamTable+DRAMOffset_PageZero) ; offset from a3 to L1PT end
        ADD     a3, a3, a4
        MOV     a4, #16*1024+4096 ; Clear the full L1PT page, even though we only need/use a small part of it
 |
        LDR     a4, =DRAMOffset_L1PT+16*1024-(PhysRamTable+DRAMOffset_PageZero) ; offset from a3 to L1PT end
        ADD     a3, a3, a4
        MOV     a4, #16*1024
 ]
        MOV     v2, #0
        MOV     v3, #0
        MOV     v4, #0
        MOV     v5, #0
        MOV     v6, #0
        MOV     v7, #0
        MOV     v8, #0
        MOV     ip, #0
60
        STMDB   a3!, {v2-v8,ip}                         ; start at end and work back
        SUBS    a4, a4, #8*4
        BNE     %BT60

 [ pt = "LongDesc"
        ; a3 -> L2PT
        ; Set up L1PT
        ADD     lr, a3, #DRAMOffset_LL1PT - DRAMOffset_PageTables
        ASSERT  :INDEX: DRAMOffset_PageTables = :INDEX: DRAMOffset_LL2PT
        ORR     v1, a3, #LL12_Table
        MOV     v2, #0
        STRD    v1, [lr], #8
        ADD     v1, v1, #4096
        STRD    v1, [lr], #8
        ADD     v1, v1, #4096
        STRD    v1, [lr], #8
        ADD     v1, v1, #4096
        STRD    v1, [lr], #8
 ]

        ADD     v1, a3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        ADD     v2, a3, #DRAMOffset_LastFixed_$pt - DRAMOffset_PageTables
        STR     a2, [v1, #RAMLIMIT]                     ; remember the RAM size
        SUB     lr, a2, #1
        STR     lr, [v1, #MaxCamEntry]
        MOV     lr, a2, LSR #12-CAM_EntrySizeLog2       ; no. of pages needed for CAM
        CMP     a2, lr, LSL #12-CAM_EntrySizeLog2
        ADDNE   lr, lr, #1                              ; round up
        MOV     lr, lr, LSL #12
        STR     lr, [v1, #SoftCamMapSize]
        STR     a3, [v1, #InitUsedStart]                ; store start of L1PT

        ADD     v1, v1, #DRAMPhysAddrA
        MOV     v2, v2, LSR #12
        MOV     v3, a3

        ; Detect if the DMA claiming adjusted the first block
        ; If so, we'll need to reset v2 to the start of the block at v1
        LDR     a1, [v1]
        ADD     lr, a1, #DRAMOffset_LastFixed_$pt:SHR:12
        TEQ     lr, v2
        MOVNE   v2, a1

; For the next batch of allocation routines, v1-v3 are treated as globals.
; v1 -> current entry in PhysRamTable
; v2 -> next address to allocate in v1 (may point at end of v1), in units of pages
; v3 -> L1PT (or 0 if MMU on - not yet)

; Set up some temporary PCBTrans and PPLTrans pointers, and the initial page flags used by the page tables
        ADD     a1, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        BL      Init_PCBTrans_$pt

; Allocate the L2PT backing store for the logical L2PT space, to
; prevent recursion.
  [ pt = "LongDesc"
        ; L3PT is 8MB in size, and each 4KB of L3PT only covers 2MB of RAM
        ; This means we'll need 4 pages of L3PT to create the full logical
        ; mapping of L3PT. So to avoid recursion, we need to start by explicitly
        ; allocating the area that will be used for the logical mapping of L3PT.
        LDR     a1, =LL3PT+(LL3PT:SHR:9)
        MOV     a2, #&00800000:SHR:9
        BL      AllocateL2PT_$pt
        ; Now allocate the rest of the L3PT logical mapping
        LDR     a1, =LL3PT
        MOV     a2, #&00800000
        BL      AllocateL2PT_$pt
  |
        ; Each 4KB of L2PT covers 4MB of RAM, and L2PT is 4MB in size, so the
        ; first page we allocate here will allow us to create the full logical
        ; mapping of L2PT (protecting against any recursion in the allocation
        ; routines)
        LDR     a1, =L2PT
        MOV     a2, #&00400000
        BL      AllocateL2PT_$pt
  ]

; Allocate workspace for the HAL

        ADD     a4, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [sp, #8]                            ; recover pushed HAL header
        LDR     a1, =HALWorkspace
        LDR     a2, =AreaFlags_HALWorkspace
        LDR     lr, [a3, #HALDesc_Workspace]            ; their workspace
        LDR     ip, [a3, #HALDesc_NumEntries]           ; plus 1 word per entry
        CMP     ip, #KnownHALEntries
        MOVLO   ip, #KnownHALEntries
        ADD     ip, ip, #1                              ; round up so that HAL workspace is doubleword aligned
        BIC     ip, ip, #1
        ADD     lr, lr, ip, LSL #2
        MOV     a3, lr, LSR #12                         ; round workspace up to whole
        MOV     a3, a3, LSL #12                         ; number of pages
        CMP     a3, lr
        ADDNE   a3, a3, #&1000
        STR     a3, [a4, #HAL_WsSize]                   ; Make a note of allocated space
        ADD     ip, a1, ip, LSL #2                      ; Their workspace starts
        STR     ip, [a4, #HAL_Workspace]                ; after our table of entries
        BL      Init_MapInRAM_$pt

        LDR     a3, [sp, #8]                            ; recover pushed HAL header
        LDR     lr, [a3, #HALDesc_Flags]
        TST     lr, #HALFlag_NCNBWorkspace              ; do they want uncacheable
        LDRNE   a1, =HALWorkspaceNCNB                   ; workspace?
        LDRNE   a2, =AreaFlags_HALWorkspaceNCNB
        LDRNE   a3, =32*1024
        BLNE    Init_MapInRAM_DMA_$pt

; Bootstrap time. We want to get the MMU on ASAP. We also don't want to have to
; clear up too much mess later. So what we'll do is map in the three fixed areas
; (L1PT, scratch space and page zero), the CAM, ourselves, and the HAL,
; then turn on the MMU. The CAM will be filled in once the MMU is on, by
; reverse-engineering the page tables?

        ; Map in page zero
        ADD     a1, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a2, =ZeroPage
        LDR     a3, =AreaFlags_ZeroPage
        MOV     a4, #16*1024
        BL      Init_MapIn_$pt

        ; Map in scratch space
        ADD     a1, v3, #DRAMOffset_ScratchSpace - DRAMOffset_PageTables
        MOV     a2, #ScratchSpace
        LDR     a3, =AreaFlags_ScratchSpace
        MOV     a4, #16*1024
        BL      Init_MapIn_$pt

      [ pt = "LongDesc"
        ; Map in L1PT+L2PT
        MOV     a1, v3
        LDR     a2, =LL2PT
        ADD     a3, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [a3, #PageTable_PageFlags]
        ORR     a3, a3, #PageFlags_Unavailable
        ASSERT  LL1PT=LL2PT+16*1024
        MOV     a4, #16*1024+4096
        BL      Init_MapIn_$pt
      |
        ; Map in L1PT
        MOV     a1, v3
        LDR     a2, =L1PT
        ADD     a3, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [a3, #PageTable_PageFlags]
        ORR     a3, a3, #PageFlags_Unavailable
        MOV     a4, #16*1024
        BL      Init_MapIn_$pt
      ]

        ; Map in L1PT again in PhysicalAccess (see below)
      [ pt = "LongDesc"
        MOV     a1, v3, LSR #21
        MOV     a1, a1, LSL #21                 ; 2MB containing L1PT
        MOV     a4, #2*1024*1024
      |
        MOV     a1, v3, LSR #20
        MOV     a1, a1, LSL #20                 ; megabyte containing L1PT
        MOV     a4, #1024*1024
      ]
        LDR     a2, =PhysicalAccess
        ADD     a3, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [a3, #PageTable_PageFlags]
        ORR     a3, a3, #PageFlags_Unavailable
        BL      Init_MapIn_$pt


        ; Examine HAL and RISC OS locations
        LDMFD   sp, {v4,v5,v6}                  ; v4 = flags, v5 = RO desc, v6 = HAL desc
        LDR     lr, [v6, #HALDesc_Size]
        LDR     v7, [v6, #HALDesc_Start]
        ADD     v6, v6, v7                      ; (v6,v8)=(start,end) of HAL
        ADD     v8, v6, lr

        LDR     v7, [v5, #OSHdr_ImageSize]
        ADD     v7, v5, v7                      ; (v5,v7)=(start,end) of RISC OS

        TEQ     v8, v5                          ; check contiguity (as in a ROM image)
        BNE     %FT70

        ; HAL and RISC OS are contiguous. Yum.
        MOV     a1, v6
        LDR     a2, =RISCOS_Header
        SUB     a2, a2, lr

        SUB     ip, a2, a1                      ; change physical addresses passed in
        LDMIB   sp, {a3, a4}                    ; into logical addresses
        ADD     a3, a3, ip
        ADD     a4, a4, ip
        STMIB   sp, {a3, a4}
        LDR     a3, [v5, #OSHdr_DecompressHdr]  ; check if ROM is compressed, and if so, make writeable
        CMP     a3, #0
        MOVNE   a3, #OSAP_None
        MOVEQ   a3, #OSAP_ROM
        SUB     a4, v7, v6
        BL      Init_MapIn_$pt
        MOV     a3, v6
        B       %FT75

70
        ; HAL is separate. (We should cope with larger images)
        LDR     a2, =ROM
        MOV     a1, v6
        SUB     ip, a2, a1                      ; change physical address passed in
        LDR     a3, [sp, #8]                    ; into logical address
        ADD     a3, a3, ip
        STR     a3, [sp, #8]
        SUB     a4, v8, v6
        MOV     a3, #OSAP_ROM
        BL      Init_MapIn_$pt

        ; And now map in RISC OS
        LDR     a2, =RISCOS_Header              ; Hmm - what if position independent?
        MOV     a1, v5
        SUB     ip, a2, a1                      ; change physical address passed in
        LDR     a3, [sp, #4]                    ; into logical address
        ADD     a3, a3, ip
        STR     a3, [sp, #4]
        SUB     a4, v7, v5
        LDR     a3, [v5, #OSHdr_DecompressHdr]
        CMP     a3, #0
        MOVNE   a3, #OSAP_None
        MOVEQ   a3, #OSAP_ROM
        BL      Init_MapIn_$pt
        MOV     a3, v5
75
        ; We've now allocated all the pages we're going to before the MMU comes on.
        ; Note the end address (for RAM clear)
        ADD     a1, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        STR     v1, [a1, #InitUsedBlock]
        STR     v2, [a1, #InitUsedEnd]
        STR     a3, [a1, #ROMPhysAddr]

        ; Note the HAL flags passed in.
        LDR     a2, [sp, #0]
        STR     a2, [a1, #HAL_StartFlags]

        ; Set up a reset IRQ handler (for IIC CMOS access)
        MSR     CPSR_c, #IRQ32_mode + I32_bit + F32_bit
        LDR     sp_irq, =ScratchSpace + 1024    ; 1K is plenty since Reset_IRQ_Handler now runs in SVC mode
        MSR     CPSR_c, #SVC32_mode + I32_bit + F32_bit
        LDR     a2, =Reset_IRQ_Handler
        STR     a2, [a1, #InitIRQHandler]

        ; Fill in some initial processor vectors. These will be used during ARM
        ; analysis, once the MMU is on. We do it here before the data cache is
        ; activated to save any IMB issues.
        ADRL    a2, InitProcVecs
        ADD     a3, a2, #InitProcVecsEnd - InitProcVecs
76      LDR     a4, [a2], #4
        CMP     a2, a3
        STR     a4, [a1], #4
        BLO     %BT76

MMU_activation_zone_$pt
; The time has come to activate the MMU. Steady now... Due to unpredictability of MMU
; activation, need to ensure that mapped and unmapped addresses are equivalent. To
; do this, we temporarily make the section containing virtual address MMUon_instr map
; to the same physical address. In case the code crosses a section boundary, do the
; next section as well.
;
        MOV     a1, #4_0000000000000001                         ; domain 0 client only
        ARM_MMU_domain a1

        ADR     a1, MMU_activation_zone_$pt
      [ pt = "LongDesc"
        ; To avoid refactoring things too much, only touch one page table
        ; entry instead of two. However because each L2PT entry is 2MB we end
        ; up covering the same area (just 2MB aligned instead of the 1MB
        ; alignment that you'd get with short descriptor L1PT).
        MOV     a3, a1, LSR #21                 ; a3 = 2MB number (stays there till end)
        ADD     lr, v3, a3, LSL #3              ; lr -> L2PT entry
        LDRD    a1, [lr]                        ; remember old mapping
        Push    "a1,a2"
        MOV     a1, a3, LSL #21
        ORR     a1, a1, #LL12_Block + LLAttr_Nrm_NC + LL_Page_LowAttr_AP2 ; Non-cacheable, read-only
        ORR     a1, a1, #LL_Page_LowAttr_AF
        MOV     a2, #0
        STRD    a1, [lr]
        Pull    "a1,a2"
      |
        MOV     a1, a1, LSR #20                 ; a1 = megabyte number (stays there till end)
        ADD     lr, v3, a1, LSL #2              ; lr -> L1PT entry
        LDMIA   lr, {a2, a3}                    ; remember old mappings
 [ MEMM_Type = "VMSAv6"
        LDR     ip, =(AP_ROM * L1_APMult) + L1_Section
 |
  [ ARM6support
        LDR     ip, =(AP_None * L1_APMult) + L1_U + L1_Section
  |
        LDR     ip, =(AP_ROM * L1_APMult) + L1_U + L1_Section
  ]
 ]
        ORR     a4, ip, a1, LSL #20             ; not cacheable, as we don't want
        ADD     v4, a4, #1024*1024              ; to fill the cache with rubbish
        STMIA   lr, {a4, v4}
      ]

        MOV     a4, a1
        Push    "a2,lr"
      [ pt = "LongDesc"
        ADD     a1, v3, #DRAMOffset_LL1PT-DRAMOffset_PageTables
      |
        MOV     a1, v3
      ]
        ADD     a2, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        PTOp    SetTTBR
        Pull    "a2,lr"
        BL      Init_ARMarch                    ; corrupts a1 and ip
        MOV     ip, a1                          ; Remember architecture for later
        MOV     a1, a4

        MSREQ   CPSR_c, #F32_bit+I32_bit+UND32_mode ; Recover the soft copy of the CR
        MOVEQ   v5, sp
        ARM_read_control v5, NE
  [ CacheOff
        ORR     v5, v5, #MMUC_M                 ; MMU on
        ORR     v5, v5, #MMUC_R                 ; ROM mode enable
  |
        ORR     v5, v5, #MMUC_W+MMUC_C+MMUC_M   ; Write buffer, data cache, MMU on
        ORR     v5, v5, #MMUC_R+MMUC_Z          ; ROM mode enable, branch predict enable
  ]
  [ MEMM_Type = "VMSAv6"
        ORR     v5, v5, #MMUC_XP ; Extended pages enabled (v6)
        BIC     v5, v5, #MMUC_TRE+MMUC_AFE ; TEX remap, Access Flag disabled
        BIC     v5, v5, #MMUC_EE+MMUC_TE+MMUC_VE ; Exceptions = nonvectored LE ARM
      [ SupportARMv6 :LAND: NoARMv7
        ; Deal with a couple of ARM11 errata
        ARM_read_ID lr
        LDR     a4, =&FFF0
        AND     lr, lr, a4
        LDR     a4, =&B760
        TEQ     lr, a4
        BNE     %FT01
        ORR     v5, v5, #MMUC_FI ; Erratum 716151: Disable hit-under-miss (enable fast interrupt mode) to prevent D-cache corruption from D-cache cleaning (the other workaround, ensuring a DSB exists inbetween the clean op and the next store access to that cache line, feels a bit heavy-handed since we'd probably have to disable IRQs to make it fully safe)
        ; Update the aux control register
        MRC     p15, 0, lr, c1, c0, 1
        ; Bit 28: Erratum 714068: Set PHD bit to prevent deadlock from PLI or I-cache invalidate by MVA
        ; Bit 31: Erratum 716151: Set FIO bit to override some of the behaviour implied by FI bit
        ORR     lr, lr, #(1:SHL:28)+(1:SHL:31)
        MCR     p15, 0, lr, c1, c0, 1
        myISB   ,lr
        TEQ     pc, #0 ; Restore NE condition from Init_ARMarch
01
      ]
  ]
  [ NoUnaligned
        ORR     v5, v5, #MMUC_A ; Alignment exceptions on
  ]
  [ HiProcVecs
        ORR     v5, v5, #MMUC_V ; High processor vectors enabled
  ]

MMUon_instr_$pt
; Note, no RAM access until we've reached MMUon_nol1ptoverlap and the flat
; logical-physical mapping of the ROM has been removed (we can't guarantee that
; the RAM mapping hasn't been clobbered, and SP is currently bogus).
        ARM_write_control v5
  [ MEMM_Type = "VMSAv6"
        MOV     lr, #0
        myISB   ,lr,,y ; Just in case
  ]
        MOVEQ   sp, v5
        MSREQ   CPSR_c, #F32_bit+I32_bit+SVC32_mode

  [ MEMM_Type = "VMSAv6"
        CMP     ip, #ARMvF
        BEQ     %FT01
        MCRNE   ARM_config_cp,0,lr,ARMv4_cache_reg,C7           ; junk MMU-off contents of I-cache (works on ARMv3)
        B       %FT02
01      MCREQ   p15, 0, lr, c7, c5, 0           ; invalidate instruction cache
        MCREQ   p15, 0, lr, c8, c7, 0           ; invalidate TLBs
        MCREQ   p15, 0, lr, c7, c5, 6           ; invalidate branch predictor
        myISB   ,lr,,y ; Ensure below branch works
        BLEQ    HAL_InvalidateCache_ARMvF       ; invalidate data cache (and instruction+TLBs again!)
02
  |
        MOV     lr, #0                                          ; junk MMU-off contents of I-cache
        MCR     ARM_config_cp,0,lr,ARMv4_cache_reg,C7           ; (works on ARMv3)
  ]

; MMU now on. Need to jump to logical copy of ourselves. Complication arises if our
; physical address overlaps our logical address - in that case we need to map
; in another disjoint copy of ourselves and branch to that first, then restore the
; original two sections.
        ADRL    a4, RISCOS_Header
        LDR     ip, =RISCOS_Header
        SUB     ip, ip, a4
        ADR     a4, MMU_activation_zone_$pt
      [ pt = "LongDesc"
        MOV     a4, a4, LSR #21
        MOV     a4, a4, LSL #21                 ; a4 = base of scrambled region
      |
        MOV     a4, a4, LSR #20
        MOV     a4, a4, LSL #20                 ; a4 = base of scrambled region
      ]
        ADD     v4, a4, #2*1024*1024            ; v4 = top of scrambled region
        SUB     v4, v4, #1                      ;      (inclusive, in case wrapped to 0)
        ADR     v5, MMUon_resume_$pt
        ADD     v5, v5, ip                      ; v5 = virtual address of MMUon_resume
        CMP     v5, a4
        BLO     MMUon_nooverlap_$pt
        CMP     v5, v4
        BHI     MMUon_nooverlap_$pt

      [ pt = "LongDesc"
        ASSERT  ROM > 4*1024*1024
; Oh dear. We know the ROM lives high up, so we'll mangle 00200000-003FFFFF.
; But as we're overlapping the ROM, we know we're not overlapping the page tables.
        LDR     lr, =LL2PT                      ; accessing the L2PT virtually now
        ; Use IP+SP as temp
        ORR     ip, a4, #LL12_Block + LLAttr_Nrm_NC + LL_Page_LowAttr_AP2 ; Non-cacheable, read-only
        ORR     ip, ip, #LL_Page_LowAttr_AF
        MOV     sp, #0
        LDRD    v7, [lr, #8]
        STRD    ip, [lr, #8]
        myDSB   ,ip                             ; there shouldn't have been anything at the target, so DSB+ISB is all that's needed for synchronisation
        myISB   ,ip,,y
        RSB     ip, a4, #&00200000
        ADD     pc, pc, ip
        NOP
MMUon_overlapresume_$pt                         ; now executing from 00200000
        ADD     ip, lr, a4, LSR #18
        STRD    a1, [ip]                        ; restore original set of mappings
        BL      Init_PageTablesChanged

        MOV     a1, v7                          ; arrange for code below
        MOV     a2, v8                          ; to restore this area instead
        MOV     a3, #1
      |
        ASSERT  ROM > 3*1024*1024
; Oh dear. We know the ROM lives high up, so we'll mangle 00100000-002FFFFF.
; But as we're overlapping the ROM, we know we're not overlapping the page tables.
        LDR     lr, =L1PT                       ; accessing the L1PT virtually now
 [ MEMM_Type = "VMSAv6"
        LDR     ip, =(AP_ROM * L1_APMult) + L1_Section
 |
  [ ARM6support
        LDR     ip, =(AP_None * L1_APMult) + L1_U + L1_Section
  |
        LDR     ip, =(AP_ROM * L1_APMult) + L1_U + L1_Section
  ]
 ]
        ORR     v6, a4, ip
        ADD     ip, v6, #1024*1024
        LDMIB   lr, {v7, v8}                    ; sections 1 and 2
        STMIB   lr, {v6, ip}
 [ MEMM_Type = "VMSAv6"
        myDSB   ,ip                             ; there shouldn't have been anything at the target, so DSB+ISB is all that's needed for synchronisation
        myISB   ,ip,,y
 ]
        RSB     ip, a4, #&00100000
        ADD     pc, pc, ip
        NOP
MMUon_overlapresume                             ; now executing from 00100000
        ADD     ip, lr, a4, LSR #18
        STMIA   ip, {a2, a3}                    ; restore original set of mappings
        BL      Init_PageTablesChanged

        MOV     a2, v7                          ; arrange for code below
        MOV     a3, v8                          ; to restore section 1+2 instead
        MOV     a1, #1
      ]

; Jump to the final logical mapping of ROM
MMUon_nooverlap_$pt
        ADRL    lr, RISCOS_Header
        LDR     ip, =RISCOS_Header
        SUB     ip, ip, lr
        ADD     pc, pc, ip
        NOP
MMUon_resume_$pt
; Repair the page table mapping that we temporarily modified.
; But what if the logical address of the page tables is at the physical address of the code? (i.e. our temporary modification is blocking access to the page tables)
; Then we have to access it via PhysicalAccess instead.
      [ pt = "LongDesc"
        LDR     lr, =LL2PT
      |
        LDR     lr, =L1PT
      ]
        CMP     lr, a4
        BLO     MMUon_nol1ptoverlap_$pt
        CMP     lr, v4
        BHI     MMUon_nol1ptoverlap_$pt
; PhysicalAccess points to the megabyte containing the L1PT. Find the L1PT within it.
        LDR     lr, =PhysicalAccess
      [ pt = "LongDesc"
        MOV     v6, v3, LSL #11
        ORR     lr, lr, v6, LSR #11
MMUon_nol1ptoverlap_$pt
        ADD     lr, lr, a3, LSL #3
        STRD    a1, [lr]
      |
        MOV     v6, v3, LSL #12
        ORR     lr, lr, v6, LSR #12
MMUon_nol1ptoverlap_$pt
        ADD     lr, lr, a1, LSL #2
        STMIA   lr, {a2, a3}
      ]
        BL      Init_PageTablesChanged

; The MMU is now on. Wahey. Let's get allocating.
;
; Input:
; v3 = phys addr of DRAMOffset_PageTables
; R13_und = MMU control register soft copy (for ARMv3)
;
; All other registers are set up from scratch

        LDR     sp, =ScratchSpace + ScratchSpaceSize - 4*3 ; 3 items already on stack :)

        LDR     a1, =ZeroPage

        ADD     lr, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables ; lr = PhysAddr of zero page

        LDR     v1, [a1, #InitUsedBlock]                ; turn this from Phys to Log
        SUB     v1, v1, lr
        ADD     v1, v1, a1
        STR     v1, [a1, #InitUsedBlock]
        LDR     v2, [a1, #InitUsedEnd]

; Store the logical address of the HAL descriptor
        LDR     a2, [sp, #8]
        STR     a2, [a1, #HAL_Descriptor]

        MOV     v3, #0                                  ; "MMU is on" signal

        BL      ARM_Analyse

        ChangedProcVecs a1

      [ LongDesc
        ASSERT  L1_Fault = LL_Fault
      ]
        MOV     a1, #L1_Fault
        PTOp    ReleasePhysicalAddress

        LDR     a1, =HALWorkspace
        LDR     a2, =ZeroPage
        LDR     a3, [a2, #HAL_WsSize]
      [ ZeroPage <> 0
        MOV     a2, #0
      ]
        BL      memset

        LDR     a2, =ZeroPage
        LDR     a1, =IOLimit
        STR     a1, [a2, #IOAllocLimit]
        LDR     a1, [a2, #SoftCamMapSize]
        RSB     a1, a1, #CAMTop ; Start of CAM
      [ pt = "LongDesc"
        BFC     a1, #0, #21 ; Round down to 2MB for IO start
      |
        MOV     a1, a1, LSR #20 ; Round down to 1MB for IO start
        MOV     a1, a1, LSL #20
      ]
        STR     a1, [a2, #IOAllocPtr]
        STR     a1, [a2, #IOAllocTop]

        BL      SetUpHALEntryTable

; Initialise the HAL. Due to its memory claiming we need to get our v1 and v2 values
; into workspace and out again around it.

        LDR     a1, =ZeroPage
        STR     v1, [a1, #InitUsedBlock]
        STR     v2, [a1, #InitUsedEnd]

        LDR     a1, =RISCOS_Header
        LDR     a2, =HALWorkspaceNCNB
        AddressHAL
        CallHAL HAL_Init

        DebugTX "HAL initialised"

        MOV     a1, #64 ; Old limit prior to OMAP3 port
        CallHAL HAL_IRQMax
        CMP     a1, #MaxInterrupts
        MOVHI   a1, #MaxInterrupts ; Avoid catastrophic failure if someone forgot to increase MaxInterrupts
        LDR     a2, =ZeroPage
        STR     a1, [a2, #IRQMax]

        LDR     v1, [a2, #InitUsedBlock]
        LDR     v2, [a2, #InitUsedEnd]

; Start timer zero, at 100 ticks per second
        MOV     a1, #0
        CallHAL HAL_TimerGranularity

        MOV     a2, a1
        MOV     a1, #100
        BL      __rt_udiv

        MOV     a2, a1
        MOV     a1, #0
        CallHAL HAL_TimerSetPeriod

        DebugTX "IICInit"

        BL      IICInit

; Remember some stuff that's about to get zapped
        LDR     ip, =ZeroPage
        LDR     v4, [ip, #ROMPhysAddr]
        LDR     v5, [ip, #RAMLIMIT]
        LDR     v7, [ip, #MaxCamEntry]
        LDR     v8, [ip, #IRQMax]

        LDR     a1, [ip, #HAL_StartFlags]
        TST     a1, #OSStartFlag_RAMCleared
        BLEQ    ClearWkspRAM            ; Only clear the memory if the HAL didn't

; Put it back
        LDR     a1, =ZeroPage
        STR     v4, [a1, #ROMPhysAddr]
        STR     v5, [a1, #RAMLIMIT]
        STR     v7, [a1, #MaxCamEntry]
        STR     v8, [a1, #IRQMax]
        MSR     CPSR_c, #F32_bit + UND32_mode           ; retrieve the MMU control register soft copy
        STR     sp, [a1, #MMUControlSoftCopy]
        MSR     CPSR_c, #F32_bit + SVC32_mode
        MOV     v8, a1

; Calculate CPU feature flags
        BL      ReadCPUFeatures

        DebugTX "HAL_CleanerSpace"

; Set up the data cache cleaner space if necessary (eg. for StrongARM core)
        MOV     a1, #-1
        CallHAL HAL_CleanerSpace
        CMP     a1, #-1          ;-1 means none needed (HAL only knows this if for specific ARM core eg. system-on-chip)
        BEQ     %FT20
        LDR     a2, =DCacheCleanAddress
        LDR     a3, =AreaFlags_DCacheClean
        ASSERT  DCacheCleanSize = 4*&10000                ; 64k of physical space used 4 times (allows large page mapping)
        MOV     a4, #&10000
        MOV     ip, #4
        SUB     sp, sp, #5*4       ;room for a1-a4,ip
10
        STMIA   sp, {a1-a4, ip}
        BL      Init_MapIn_$pt
        LDMIA   sp, {a1-a4, ip}
        SUBS    ip, ip, #1
        ADD     a2, a2, #&10000
        BNE     %BT10
        ADD     sp, sp, #5*4

20
; Decompress the ROM
        LDR     a1, =RISCOS_Header
        LDR     a2, [a1, #OSHdr_DecompressHdr]
        CMP     a2, #0
        BEQ     %FT30
        ADD     ip, a1, a2
        ASSERT  OSDecompHdr_WSSize = 0
        ASSERT  OSDecompHdr_Code = 4
        LDMIA   ip, {a3-a4}
        ADRL    a2, SyncCodeAreas
        CMP     a3, #0 ; Any workspace required?
        ADD     a4, a4, ip
   [ DebugHALTX
        BNE     %FT25
        DebugTX "Decompressing ROM, no workspace required"
      [ NoARMv5
        MOV     lr, pc
        MOV     pc, a4
      |
        BLX     a4
      ]
        DebugTX "Decompression complete"
        B       %FT27
25
   |
        ADREQ   lr, %FT27
        MOVEQ   pc, a4
   ]
        Push    "a1-a4,v1-v2,v5-v7"
; Allocate workspace for decompression code
; Workspace is located at a 4MB-aligned log addr, and is a multiple of 1MB in
; size. This greatly simplifies the code required to free the workspace, since
; we can guarantee it will have been section-mapped, and won't hit any
; partially-allocated L2PT blocks (where 4 L1PT entries point to subsections of
; the same L2PT page)
; This means all we need to do to free the workspace is zap the L1PT entries
; and rollback v1 & v2
; Note: This is effectively a MB-aligned version of Init_MapInRAM
ROMDecompWSAddr_$pt * 4<<20
   [ pt = "LongDesc"
ROMDecompAlign_$pt * 21 ; 2MB aligned for long descriptor format
   |
ROMDecompAlign_$pt * 20
   ]
        DebugTX "Allocating decompression workspace"
        LDR     v5, =(1<<ROMDecompAlign_$pt)-1
        ADD     v7, a3, v5
        BIC     v7, v7, v5 ; MB-aligned size
        STR     v7, [sp, #8] ; Overwrite stacked WS size
        MOV     v6, #ROMDecompWSAddr_$pt ; Current log addr
26
        ADD     v2, v2, v5, LSR #12
        BIC     v2, v2, v5, LSR #12 ; MB-aligned physram
        LDMIA   v1, {a2, a3}
        SUB     a2, v2, a2 ; Amount of bank used (page units)
        RSB     a2, a2, a3, LSR #12 ; Amount of bank remaining (page units)
        MOVS    a2, a2, ASR #ROMDecompAlign_$pt-12 ; Round down to nearest MB
        LDRLE   v2, [v1, #8]! ; Move to next bank if 0MB left
        BLE     %BT26
        CMP     a2, v7, LSR #ROMDecompAlign_$pt
        MOVHS   a4, v7
        MOVLO   a4, a2, LSL #ROMDecompAlign_$pt ; a4 = amount to take
        MOV     a1, v2, LSL #12 ; set up parameters for MapIn call
        MOV     a2, v6
        MOV     a3, #OSAP_None
        SUB     v7, v7, a4 ; Decrease amount to allocate
        ADD     v2, v2, a4, LSR #12 ; Increase physram ptr
        ADD     v6, v6, a4 ; Increase logram ptr
        BL      Init_MapIn_$pt
        CMP     v7, #0
        BNE     %BT26
        Pull    "a1-a2,v1-v2" ; Pull OS header, IMB func ptr, workspace size, decompression code
        MOV     a3, #ROMDecompWSAddr_$pt
        DebugTX "Decompressing ROM"
      [ NoARMv5
        MOV     lr, pc
        MOV     pc, v2
      |
        BLX     v2
      ]
        DebugTX "Decompression complete"
; Before we free the workspace, make sure we zero it
        MOV     a1, #ROMDecompWSAddr_$pt
        MOV     a2, #0
        MOV     a3, v1
        BL      memset
; Flush the workspace from the cache so we can unmap it
; Really we should make the pages uncacheable first, but for simplicity we do a
; full cache+TLB clean+invalidate later on when changing the ROM permissions
        MOV     a1, #ROMDecompWSAddr_$pt
        ADD     a2, a1, v1
        ARMop   Cache_CleanInvalidateRange
; Zero each L1PT entry
      [ pt = "LongDesc"
        ; Ensure STRD is used
        MOV     a1, #0
        MOV     a2, #0
        LDR     a3, =LL2PT+(ROMDecompWSAddr_$pt>>18)
        MOV     a4, v1, LSR #18
265
        STRD    a1, [a3], #8
        SUBS    a4, a4, #8
        BGE     %BT265
      |
        LDR     a1, =L1PT+(ROMDecompWSAddr_$pt>>18)
        MOV     a2, #0
        MOV     a3, v1, LSR #18
        BL      memset
      ]
; Pop our registers and we're done
        Pull    "v1-v2,v5-v7"
        DebugTX "ROM decompression workspace freed"
27
; Now that the ROM is decompressed we need to change the ROM page mapping to
; read-only. The easiest way to do this is to make another call to Init_MapIn.
; But before we can do that we need to work out if the HAL+OS are contiguous in
; physical space, so that we remap the correct area.
        LDR     a1, =RISCOS_Header
        BL      RISCOS_LogToPhys
        Push    "a1"
        LDR     a1, =ROM
        BL      RISCOS_LogToPhys
        Pull    "a2"                    ; a1 = HAL phys, a2 = OS phys
        LDR     a3, =ZeroPage
        LDR     a3, [a3, #HAL_Descriptor]
        LDR     a3, [a3, #HALDesc_Size]
        ADD     a3, a3, a1              ; a3 = phys addr of OS if contiguous
        CMP     a2, a3
; Contiguous mapping, remap combined HAL+OS
        MOVEQ   a2, #ROM
        MOVEQ   a4, #OSROM_ImageSize*1024
; Discontiguous mapping, only remap OS
        MOVNE   a1, a2
        LDRNE   a2, =RISCOS_Header
        LDRNE   a4, [a2, #OSHdr_ImageSize]
        MOV     a3, #OSAP_ROM
        BL      Init_MapIn_$pt
; Flush & invalidate cache/TLB to ensure everything respects the new page access
; Putting a flush here also means the decompression code doesn't have to worry
; about IMB'ing the decompressed ROM
        ARMop   MMU_Changing ; Perform full clean+invalidate to ensure any lingering cache lines for the decompression workspace are gone
        DebugTX "ROM access changed to read-only"
30
; Allocate the CAM
        LDR     a3, [v8, #SoftCamMapSize]
        LDR     a2, =AreaFlags_CAM
        RSB     a1, a3, #CAMTop
        STR     a1, [v8, #CamEntriesPointer]
        BL      Init_MapInRAM_$pt

; Allocate the supervisor stack
        LDR     a1, =SVCStackAddress
        LDR     a2, =AreaFlags_SVCStack
        LDR     a3, =SVCStackSize
        BL      Init_MapInRAM_$pt

; Allocate the interrupt stack
        LDR     a1, =IRQStackAddress
        LDR     a2, =AreaFlags_IRQStack
        LDR     a3, =IRQStackSize
        BL      Init_MapInRAM_$pt

; Allocate the abort stack
        LDR     a1, =ABTStackAddress
        LDR     a2, =AreaFlags_ABTStack
        LDR     a3, =ABTStackSize
        BL      Init_MapInRAM_$pt

; Allocate the undefined stack
        LDR     a1, =UNDStackAddress
        LDR     a2, =AreaFlags_UNDStack
        LDR     a3, =UNDStackSize
        BL      Init_MapInRAM_$pt

; Allocate the system heap (just 32K for now - will grow as needed)
        LDR     a1, =SysHeapAddress
        LDR     a2, =AreaFlags_SysHeap
        LDR     a3, =32*1024
        BL      Init_MapInRAM_Clear_$pt

; Allocate the cursor/system/sound block - first the cached bit
        LDR     a1, =CursorChunkAddress
        LDR     a2, =AreaFlags_CursorChunkCacheable
        LDR     a3, =SoundDMABuffers - CursorChunkAddress
        BL      Init_MapInRAM_DMA_$pt
; then the uncached bit
        LDR     a1, =SoundDMABuffers
        LDR     a2, =AreaFlags_CursorChunk
        LDR     a3, =?SoundDMABuffers
        BL      Init_MapInRAM_DMA_$pt

        LDR     a1, =KbuffsBaseAddress
        LDR     a2, =AreaFlags_Kbuffs
        LDR     a3, =(KbuffsSize + &FFF) :AND: &FFFFF000  ;(round to 4k)
        BL      Init_MapInRAM_Clear_$pt

; Allocate the RW/ZI data segment
        IMPORT  |Image$$RW$$Base|
        LDR     a1, =|Image$$RW$$Base|
        LDR     a2, =AreaFlags_RWArea
        IMPORT  |Image$$ZI$$Limit|
        LDR     a3, =|Image$$ZI$$Limit|+&FFF
        SUB     a3, a3, a1
        BIC     a3, a3, #&FF
        BIC     a3, a3, #&F00
        BL      Init_MapInRAM_Clear_$pt

 [ HiProcVecs
        ; Map in DebuggerSpace
        LDR     a1, =DebuggerSpace
        LDR     a2, =AreaFlags_DebuggerSpace
        LDR     a3, =(DebuggerSpace_Size + &FFF) :AND: &FFFFF000
        BL      Init_MapInRAM_Clear_$pt
 ]

 [ MinorL2PThack
; Allocate backing L2PT for application space
; Note that ranges must be 4M aligned, as AllocateL2PT only does individual
; (1M) sections, rather than 4 at a time, corresponding to a L2PT page. The
; following space is available for dynamic areas, and ChangeDyn.s will get
; upset if it sees only some out of a set of 4 section entries pointing to the
; L2PT page.
        MOV     a1, #0
        MOV     a2, #AplWorkMaxSize             ; Not quite right, but the whole thing's wrong anyway
        ASSERT  AplWorkMaxSize :MOD: (4*1024*1024) = 0
        BL      AllocateL2PT_$pt
; And for the system heap. Sigh
        LDR     a1, =SysHeapAddress
        LDR     a2, =SysHeapMaxSize
        ASSERT  SysHeapAddress :MOD: (4*1024*1024) = 0
        ASSERT  SysHeapMaxSize :MOD: (4*1024*1024) = 0
        BL      AllocateL2PT_$pt
 ]

        STR     v2, [v8, #InitUsedEnd]

        ; Put InitDMABlock back into PhysRamTable
        Push    "v1-v7"
        ASSERT  InitDMAOffset = InitDMABlock+8
        ADD     v1, v8, #InitDMABlock
        LDMIA   v1, {v1-v3}
        ADD     v3, v3, #PhysRamTable
        ADD     v3, v3, v8
        ; Work out whether the block was removed or merely shrunk
        LDMDB   v3, {v4-v5}
        ADD     v6, v1, v2, LSR #12
        ADD     v7, v4, v5, LSR #12
        STMDB   v3, {v1-v2}
        TEQ     v6, v7
        BEQ     %FT40                   ; End addresses match, it was shrunk
35
        LDMIA   v3, {v1-v2}             ; Shuffle following entries down
        STMIA   v3!, {v4-v5}
        MOV     v4, v1
        MOVS    v5, v2
        BNE     %BT35
40
        Pull    "v1-v7"

        MSR     CPSR_c, #F32_bit+I32_bit+IRQ32_mode
        LDR     sp, =IRQSTK
        MSR     CPSR_c, #F32_bit+I32_bit+ABT32_mode
        LDR     sp, =ABTSTK
        MSR     CPSR_c, #F32_bit+I32_bit+UND32_mode
        LDR     sp, =UNDSTK
        MSR     CPSR_c, #F32_bit+SVC2632
        LDR     sp, =SVCSTK

        BL      ConstructCAMfromPageTables

        MOV     a1, #4096
        STR     a1, [v8, #Page_Size]

        BL      CountPageTablePages_$pt

        BL      Count32bitPages

        B       Continue_after_HALInit

        LTORG

CountPageTablePages_$pt ROUT
        Entry
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #CamEntriesPointer]
        LDR     a3, [a1, #MaxCamEntry]
      [ ZeroPage <> 0
        MOV     a1, #0
      ]
        ADD     a3, a3, #1
        ADD     a4, a2, a3, LSL #CAM_EntrySizeLog2
      [ pt = "LongDesc"
        LDR     lr, =LL3PT
10      LDR     ip, [a4, #CAM_LogAddr-CAM_EntrySize]!
        SUB     ip, ip, lr
        CMP     ip, #8:SHL:20
        ADDLO   a1, a1, #4096
        TEQ     a4, a2
        BNE     %BT10
      |
        ASSERT  (L2PT :AND: &3FFFFF) = 0
        LDR     lr, =L2PT :SHR: 22
10      LDR     ip, [a4, #CAM_LogAddr-CAM_EntrySize]!
        TEQ     lr, ip, LSR #22
        ADDEQ   a1, a1, #4096
        TEQ     a4, a2
        BNE     %BT10
      ]
        LDR     a2, =ZeroPage
        STR     a1, [a2, #LxPTUsed]
        EXIT

; Allocate and map in some RAM.
;
; On entry:
;    a1 = logical address
;    a2 = access permissions (see Init_MapIn)
;    a3 = length (4K multiple)
;    v1 -> current entry in PhysRamTable
;    v2 = next physical address (page units)
;    v3 -> L1PT
;
; On exit:
;    a1 -> physical address of start of RAM (deduce the rest from PhysRamTable)
;
; No out of memory check...
Init_MapInRAM_$pt ROUT
        Push    "v4-v8,lr"
        MOV     v8, #-1
        MOV     v5, a3                          ; v5 = amount of memory required
        MOV     v6, a1                          ; v6 = logical address
        MOV     v7, a2                          ; v7 = access permissions
10      LDMIA   v1, {v4, ip}                    ; v4 = addr of bank, ip = len+flags
        MOV     ip, ip, LSR #12
        SUB     v4, v2, v4                      ; v4 = amount of bank used
        RSBS    v4, v4, ip                      ; v4 = amount of bank left (pages)
        LDREQ   v2, [v1, #8]!                   ; move to next bank if 0 left
        BEQ     %BT10

        CMP     v8, #-1                         ; is this the first bank?
        MOVEQ   v8, v2                          ; remember it

        CMP     v4, v5, LSR #12                 ; sufficient in this bank?
        MOVHS   a4, v5
        MOVLO   a4, v4, LSL #12                 ; a4 = amount to take

        MOV     a1, v2, LSL #12                 ; set up parameters for MapIn call
        MOV     a2, v6                          ; then move globals (in case MapIn
        MOV     a3, v7                          ; needs to allocate for L2PT)
        ADD     v2, v2, a4, LSR #12             ; advance physaddr
        SUB     v5, v5, a4                      ; decrease wanted
        ADD     v6, v6, a4                      ; advance log address pointer
        BL      Init_MapIn_$pt                  ; map in the RAM
        TEQ     v5, #0                          ; more memory still required?
        BNE     %BT10

        MOV     a1, v8
        Pull    "v4-v8,pc"

Init_MapInRAM_Clear_$pt ROUT                    ; same as Init_MapInRAM but also
        Push    "a1,a3,v5,lr"                   ; clears the mapped in result
        BL      Init_MapInRAM_$pt
        MOV     v5, a1
        Pull    "a1,a3"
        MOV     a2, #0
        BL      memset
        MOV     a1, v5
        Pull    "v5,pc"

; Allocate and map a physically contigous chunk of some DMAable RAM.
;
; On entry:
;    a1 = logical address
;    a2 = access permissions (see Init_MapIn)
;    a3 = length (4K multiple)
;    v1 -> current entry in PhysRamTable
;    v2 = next physical address (page units)
;    v3 -> L1PT
;
; On exit:
;    a1 -> physical address of start of RAM (deduce the rest from PhysRamTable)
;
; Use this routine with caution - correct total amount of required DMA memory
; must have been calculated beforehand and stashed in InitDMABlock
Init_MapInRAM_DMA_$pt ROUT
        Push    "a1,a3,v4-v5,ip,lr"
        TEQ     v3, #0                          ; MMU on?
        LDREQ   v4, =ZeroPage                   ; get workspace directly
        ADDNE   v4, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables ; deduce from L1PT
        LDR     v5, [v4, #InitDMAEnd]
        ADD     lr, v5, a3                      ; claim the RAM
        STR     lr, [v4, #InitDMAEnd]

        MOV     a4, a3
        MOV     a3, a2
        MOV     a2, a1
        MOV     a1, v5
        BL      Init_MapIn_$pt                  ; map it in
        ; DMA regions won't get cleared by ClearWkspRam, so do it manually
        ; Could potentially skip this if the HAL says RAM is already clear, but
        ; for now do it anyway (especially since startup flags haven't been set
        ; when we're first called)
        Pull    "a1,a3"
        TEQ     v3, #0
        MOVNE   a1, v5
        MOV     a2, #0
        BL      memset
        MOV     a1, v5
        Pull    "v4-v5,ip,pc"

; Map a range of physical addresses to a range of logical addresses.
;
; On entry:
;    a1 = physical address (32bit)
;    a2 = logical address
;    a3 = DA flags
;    a4 = area size (4K multiple)
;    v1 -> current entry in PhysRamTable
;    v2 = last used physical address (page units)
;    v3 -> L1PT (or 0 if MMU on)

Init_MapIn_$pt ROUT
        Entry   "v4-v7"
        MOV     v4, a1                          ; v4 = physaddr
        MOV     v5, a2                          ; v5 = logaddr
        MOV     v6, a3                          ; v6 = page flags
        MOV     v7, a4                          ; v7 = area size
        ; Set up a2-a4 for the Get*PTE functions
        TEQ     v3, #0
        LDREQ   a3, =ZeroPage
        ADDNE   a3, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        MOV     a2, v6
        LDR     a4, [a3, #MMU_PCBTrans]
        LDR     a3, [a3, #MMU_PPLTrans]

        ORR     lr, v4, v5                      ; OR together, physaddr, logaddr
        ORR     lr, lr, v7                      ; and size.
      [ pt = "LongDesc"
        MOVS    ip, lr, LSL #11                 ; If all bottom 21 bits 0
        BEQ     %FT50                           ; it's L2PT block mapped
      |
        MOVS    ip, lr, LSL #12                 ; If all bottom 20 bits 0
        BEQ     %FT50                           ; it's section mapped
      ]

        MOV     a1, #0                          ; We don't want the address in the result

        MOVS    ip, lr, LSL #16                 ; If bottom 16 bits not all 0
        BEQ     %FT05
        PTOp    Get4KPTE                        ; then small pages (4K)
        B       %FT10
05
        PTOp    Get64KPTE                       ; else large pages (64K)
10
      [ pt = "LongDesc"
        ORR     v6, a1, a2                      ; v6 = high & low attributes
      |
        MOV     v6, a1                          ; v6 = access permissions
      ]

20      MOV     a1, v4
        MOV     a2, v5
        MOV     a3, v6
        BL      Init_MapInPage_$pt              ; Loop through mapping in each
        ADD     v4, v4, #4096                   ; page in turn
        ADD     v5, v5, #4096
        SUBS    v7, v7, #4096
        BNE     %BT20
        EXIT

      [ pt = "LongDesc"
50
        BL      Get2MPTE_LongDesc
        MOVS    ip, v3                          ; is MMU on?
        LDREQ   ip, =LL2PT                      ; then use virtual address
        ADD     a3, ip, v5, LSR #18             ; a2 -> L2PT entry
70      STRD    a1, [a3], #8                    ; And store in L2PT
        ADD     a1, a1, #2*1024*1024            ; Advance two megabytes
        SUBS    v7, v7, #2*1024*1024            ; and loop
        BNE     %BT70
        EXIT
      |
50
        BL      Get1MPTE_ShortDesc
        MOVS    ip, v3                          ; is MMU on?
        LDREQ   ip, =L1PT                       ; then use virtual address
        ADD     a2, ip, v5, LSR #18             ; a2 -> L1PT entry
70      STR     a1, [a2], #4                    ; And store in L1PT
        ADD     a1, a1, #1024*1024              ; Advance one megabyte
        SUBS    v7, v7, #1024*1024              ; and loop
        BNE     %BT70
        EXIT
      ]

; Map a logical page to a physical page, allocating L2PT as necessary.
;
; On entry:
;    a1 = physical address (32bit)
;    a2 = logical address
 [ LongDesc
;    a3 = high & low page attributes merged into one word
 |
;    a3 = access permissions + C + B bits + size (all non-address bits, of appropriate type)
 ]
;    v1 -> current entry in PhysRamTable
;    v2 = last used physical address (page units)
;    v3 -> L1PT (or 0 if MMU on)
; On exit:
;    a1 = logical address
;    a2-a4, ip corrupt
;    v1, v2 updated
;

Init_MapInPage_$pt  ROUT
        Entry   "v4-v6"
        MOV     v4, a1                          ; v4 = physical address
        MOV     v5, a2                          ; v5 = logical address
        MOV     v6, a3                          ; v6 = access permissions
        MOV     a1, v5
        MOV     a2, #4096
        BL      AllocateL2PT_$pt
        TEQ     v3, #0                          ; if MMU on, access L2PT virtually...
      [ pt = "LongDesc"
        LDREQ   a1, =LL3PT
        MOVEQ   ip, v5                          ; index using whole address
        BEQ     %FT40
        MOV     ip, v5, LSR #21
        ADD     a1, v3, ip, LSL #3
        LDRD    a1, [a1]                        ; a1 = level two descriptor
        BFC     a1, #0, #LL_LowAddr_Start       ; a1 -> L3PT tables for this section
        MOV     ip, v5, LSL #32-(12+9)
        MOV     ip, ip, LSR #32-(12+9)          ; extract L3 table index bits
40      MOV     a3, v4
        BFI     a3, v6, #0, #LL_LowAddr_Start   ; Add low attributes
        MOV     a4, v6
        BFC     a4, #0, #LL_HighAttr_Start      ; High attributes
        ADD     a1, a1, ip, LSR #9
        STRD    a3, [a1]                        ; update L3PT entry
        MOV     a1, v5
        EXIT
      |
        LDREQ   a1, =L2PT                       ; a1 -> L2PT virtual address
        MOVEQ   ip, v5                          ; index using whole address
        BEQ     %FT40
        MOV     ip, v5, LSR #20
        LDR     a1, [v3, ip, LSL #2]            ; a1 = level one descriptor
        MOV     a1, a1, LSR #10
        MOV     a1, a1, LSL #10                 ; a1 -> L2PT tables for this section
        AND     ip, v5, #&000FF000              ; extract L2 table index bits
40      AND     lr, v6, #3
        TEQ     lr, #L2_LargePage               ; strip out surplus address bits from
        BICEQ   v4, v4, #&0000F000              ; large page descriptors
        ORR     lr, v4, v6                      ; lr = value for L2PT entry
        STR     lr, [a1, ip, LSR #10]           ; update L2PT entry
        MOV     a1, v5
        EXIT
      ]



; On entry:
;    a1 = virtual address L2PT required for
;    a2 = number of bytes of virtual space
;    v1 -> current entry in PhysRamTable
;    v2 = last used physical address (page units)
;    v3 -> L1PT (or 0 if MMU on)
; On exit
;    a1-a4,ip corrupt
;    v1, v2 updated
AllocateL2PT_$pt ROUT
 [ pt = "LongDesc"
        Entry   "v4-v8"
        MOV     v8, a1, LSR #21                 ; round base address down to 2M
        ADD     lr, a1, a2
        MOV     v7, lr, LSR #21
        TEQ     lr, v7, LSL #21
        ADDNE   v7, v7, #1                      ; round end address up to 2M

        MOVS    v6, v3
        LDREQ   v6, =LL2PT                      ; v6->L2PT (whole routine)

05      ADD     a3, v6, v8, LSL #3              ; L2PT contains 1 entry per 2MB
        LDRD    a1, [a3]
        TEQ     a1, #0                          ; if non-zero, the L3PT has
        BNE     %FT40                           ; already been allocated
        ; With 8 bytes per entry, every 4KB L3PT covers 2MB of RAM, the same
        ; as the single L2PT entry covers. So no need to deal with partially
        ; used pages like the short descriptor case.
        BL      Init_ClaimPhysicalPage          ; Claim a page to put L3PT in
        MOV     v4, a1

; Need to zero the L3PT. Must do it before calling in MapInPage, as that may well
; want to put something in the thing we are clearing. If the MMU is off, no problem,
; but if the MMU is on, then the L3PT isn't accessible until we've called MapInPage.
; Solution is to use the AccessPhysicalAddress call.

        TEQ     v3, #0                          ; MMU on?
        MOVNE   a1, v4                          ; if not, just access v4
        LDREQ   a1, =OSAP_None+DynAreaFlags_NotCacheable ; if so, map in v4
        MOVEQ   a2, v4
        SUBEQ   sp, sp, #4
        MOVEQ   a3, #0
        MOVEQ   a4, sp
        PTOpEQ  AccessPhysicalAddress

        MOV     a2, #0
        MOV     a3, #4*1024
        BL      memset

        TEQ     v3, #0
        LDREQ   a1, [sp], #4
        PTOpEQ  ReleasePhysicalAddress

        ; Fill in the L2PT entry
        ORR     a1, v4, #LL12_Table
        MOV     a2, #0
        ADD     a3, v6, v8, LSL #3
        STRD    a1, [a3]

        ; Get the correct page table entry flags for Init_MapInPage
        TEQ     v3, #0
        LDREQ   a3, =ZeroPage
        ADDNE   a3, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        LDR     a2, [a3, #PageTable_PageFlags]
        LDR     a4, [a3, #MMU_PCBTrans]
        LDR     a3, [a3, #MMU_PPLTrans]
        MOV     a1, #0
        BL      Get4KPTE_LongDesc

        ORR     a3, a1, a2
        MOV     a1, v4                          ; Map in the L3PT page itself
        LDR     a2, =LL3PT                      ; (can't recurse, because L3PT
        ADD     a2, a2, v8, LSL #12             ; backing for L3PT is preallocated)
        BL      Init_MapInPage_$pt


40      ADD     v8, v8, #1                      ; go back until all
        CMP     v8, v7                          ; pages allocated
        BLO     %BT05

        EXIT
 |
        Entry   "v4-v8"
        MOV     v8, a1, LSR #20                 ; round base address down to 1M
        ADD     lr, a1, a2
        MOV     v7, lr, LSR #20
        TEQ     lr, v7, LSL #20
        ADDNE   v7, v7, #1                      ; round end address up to 1M

        MOVS    v6, v3
        LDREQ   v6, =L1PT                       ; v6->L1PT (whole routine)

05      LDR     v5, [v6, v8, LSL #2]            ; L1PT contains 1 word per M
        TEQ     v5, #0                          ; if non-zero, the L2PT has
                                                ; already been allocated
        BNE     %FT40

        BIC     lr, v8, #3                      ; round down to 4M - each page
        ADD     lr, v6, lr, LSL #2              ; of L2PT maps to 4 sections
        LDMIA   lr, {a3,a4,v5,ip}               ; check if any are page mapped
        ASSERT  L1_Fault = 2_00 :LAND: L1_Page = 2_01 :LAND: L1_Section = 2_10
        TST     a3, #1
        TSTEQ   a4, #1
        TSTEQ   v5, #1
        TSTEQ   ip, #1
        BEQ     %FT20                           ; nothing page mapped - claim a page

        TST     a4, #1                          ; at least one of the sections is page mapped
        SUBNE   a3, a4, #1*1024                 ; find out where it's pointing to and
        TST     v5, #1                          ; derive the corresponding address for our
        SUBNE   a3, v5, #2*1024                 ; section
        TST     ip, #1
        SUBNE   a3, ip, #3*1024

        AND     lr, v8, #3
        ORR     a3, a3, lr, LSL #10
        STR     a3, [v6, v8, LSL #2]            ; fill in the L1PT entry
        B       %FT40                           ; no more to do

20      BL      Init_ClaimPhysicalPage          ; Claim a page to put L2PT in
        MOV     v4, a1

; Need to zero the L2PT. Must do it before calling in MapInPage, as that may well
; want to put something in the thing we are clearing. If the MMU is off, no problem,
; but if the MMU is on, then the L2PT isn't accessible until we've called MapInPage.
; Solution is to use the AccessPhysicalAddress call.

        TEQ     v3, #0                          ; MMU on?
        MOVNE   a1, v4                          ; if not, just access v4
        LDREQ   a1, =OSAP_None+DynAreaFlags_NotCacheable ; if so, map in v4
        MOVEQ   a2, v4
        SUBEQ   sp, sp, #4
        MOVEQ   a3, #0
        MOVEQ   a4, sp
        PTOpEQ  AccessPhysicalAddress

        MOV     a2, #0
        MOV     a3, #4*1024
        BL      memset

        TEQ     v3, #0
        LDREQ   a1, [sp], #4
        PTOpEQ  ReleasePhysicalAddress

      [ MEMM_Type = "VMSAv6"
        ORR     a3, v4, #L1_Page
      |
        ORR     a3, v4, #L1_Page + L1_U         ; Set the U bit for ARM6 (assume L2 pages will generally be cacheable)
      ]
        AND     lr, v8, #3
        ORR     a3, a3, lr, LSL #10
        STR     a3, [v6, v8, LSL #2]            ; fill in the L1PT

        ; Get the correct page table entry flags for Init_MapInPage
        TEQ     v3, #0
        LDREQ   a3, =ZeroPage
        ADDNE   a3, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        LDR     a2, [a3, #PageTable_PageFlags]
        LDR     a4, [a3, #MMU_PCBTrans]
        LDR     a3, [a3, #MMU_PPLTrans]
        MOV     a1, #0
        BL      Get4KPTE_ShortDesc

        MOV     a3, a1
        MOV     a1, v4                          ; Map in the L2PT page itself
        LDR     a2, =L2PT                       ; (can't recurse, because L2PT
        ADD     a2, a2, v8, LSL #10             ; backing for L2PT is preallocated)
        BIC     a2, a2, #&C00
        BL      Init_MapInPage_$pt


40      ADD     v8, v8, #1                      ; go back until all
        CMP     v8, v7                          ; pages allocated
        BLO     %BT05

        EXIT
 ]

; Called pre-MMU to set up some (temporary) PCBTrans and PPLTrans pointers,
; and the initial PageTable_PageFlags value
; Also used post-MMU for VMSAv6 case
; In:
;   a1 -> ZeroPage
; Out:
;   a1-a4, ip corrupt
Init_PCBTrans_$pt   ROUT
        LDR     a2, =AreaFlags_PageTablesAccess :OR: DynAreaFlags_NotCacheable :OR: DynAreaFlags_NotBufferable
        STR     a2, [a1, #PageTable_PageFlags]
 [ MEMM_Type = "VMSAv6"
        ADRL    a2, PPLAccess_$pt
        STR     a2, [a1, #MMU_PPLAccess]
      [ pt = "LongDesc"
        ADRL    a2, XCBTableVMSAv6Long
        STR     a2, [a1, #MMU_PCBTrans]
        ADRL    a4, PPLTrans_LongDesc
        STR     a4, [a1, #MMU_PPLTrans]
      |
        ADRL    a2, XCBTableVMSAv6
        STR     a2, [a1, #MMU_PCBTrans]

        ; Use shareable pages if we're a multicore chip
        ; N.B. it's important that we get this correct - single-core chips may
        ; treat shareable memory as non-cacheable (e.g. ARM11)

        ADRL    a4, PPLTransNonShareable_ShortDesc
        ; Look at the cache type register to work out whether this is ARMv6 or ARMv7+
        MRC     p15, 0, a2, c0, c0, 1   ; Cache type register
        TST     a2, #1<<31              ; EQ = ARMv6, NE = ARMv7+
        MRC     p15, 0, a2, c0, c0, 5   ; MPIDR
        BNE     %FT50
        MRC     p15, 0, a3, c0, c0, 0   ; ARMv6: MPIDR is optional, so compare value against MIDR to see if it's implemented. There's no multiprocessing extensions flag so assume the check against MIDR will be good enough.
        TEQ     a2, a3
        ADDNE   a4, a4, #PPLTransShareable_ShortDesc-PPLTransNonShareable_ShortDesc
        B       %FT90
50
        AND     a2, a2, #&c0000000      ; ARMv7+: MPIDR is mandatory, but multicore not guaranteed. Check if multiprocessing extensions implemented (bit 31 set), and not uniprocessor (bit 30 clear).
        TEQ     a2, #&80000000
        ADDEQ   a4, a4, #PPLTransShareable_ShortDesc-PPLTransNonShareable_ShortDesc
90
        STR     a4, [a1, #MMU_PPLTrans]
      ]
 |
        ; Detecting the right PCBTrans table to use is complex
        ; However we know that, pre-MMU, we only use the default cache policy,
        ; and we don't use CNB memory
        ; So just go for a safe PCBTrans, like SA110, and the non-extended
        ; PPLTrans
        ADRL    a2, XCBTableSA110
        STR     a2, [a1, #MMU_PCBTrans]
        ADRL    a2, PPLTrans_ShortDesc
     [ ARM6support
        ARM_6   a3
        ADDEQ   a2, a2, #PPLTransARM6-PPLTrans_ShortDesc
     ]
        STR     a2, [a1, #MMU_PPLTrans]
 ]
        MOV     pc, lr

      ]

        LTORG

ptcounter SETA ptcounter + 1
        WEND
pt      SETS ""

 [ MEMM_Type = "VMSAv6"
HAL_InvalidateCache_ARMvF
        ; Cache invalidation for ARMs with multiple cache levels, used before ARMop initialisation
        ; This function gets called before we have a stack set up, so we've got to preserve as many registers as possible
        ; The only register we can safely change is ip, but we can switch into FIQ mode with interrupts disabled and use the banked registers there
        MRS     ip, CPSR
        MSR     CPSR_c, #F32_bit+I32_bit+FIQ32_mode
        MOV     r9, #0
        MCR     p15, 0, r9, c7, c5, 0           ; invalidate instruction cache
        MCR     p15, 0, r9, c8, c7, 0           ; invalidate TLBs
        MCR     p15, 0, r9, c7, c5, 6           ; invalidate branch target predictor
        myDSB   ,r9,,y                          ; Wait for completion
        myISB   ,r9,,y
        ; Check whether we're ARMv7 (and thus multi-level cache) or ARMv6 (and thus single-level cache)
        MRC     p15, 0, r8, c0, c0, 1
        TST     r8, #&80000000 ; EQ=ARMv6, NE=ARMv7
        BEQ     %FT80

        ; This is basically the same algorithm as the MaintainDataCache_WB_CR7_Lx macro, but tweaked to use less registers and to read from CP15 directly
        MRC     p15, 1, r8, c0, c0, 1           ; Read CLIDR to r8
        TST     r8, #&07000000
        BEQ     %FT50
        MOV     r11, #0 ; Current cache level
10 ; Loop1
        ADD     r10, r11, r11, LSR #1 ; Work out 3 x cachelevel
        MOV     r9, r8, LSR r10 ; bottom 3 bits are the Cache type for this level
        AND     r9, r9, #7 ; get those 3 bits alone
        CMP     r9, #2
        BLT     %FT40 ; no cache or only instruction cache at this level
        MCR     p15, 2, r11, c0, c0, 0 ; write CSSELR from r11
        ISB
        MRC     p15, 1, r9, c0, c0, 0 ; read current CSSIDR to r9
        AND     r10, r9, #CCSIDR_LineSize_mask ; extract the line length field
        ADD     r10, r10, #4 ; add 4 for the line length offset (log2 16 bytes)
        LDR     r8, =CCSIDR_Associativity_mask:SHR:CCSIDR_Associativity_pos
        AND     r8, r8, r9, LSR #CCSIDR_Associativity_pos ; r8 is the max number on the way size (right aligned)
        CLZ     r13, r8 ; r13 is the bit position of the way size increment
        LDR     r12, =CCSIDR_NumSets_mask:SHR:CCSIDR_NumSets_pos
        AND     r12, r12, r9, LSR #CCSIDR_NumSets_pos ; r12 is the max number of the index size (right aligned)
20 ; Loop2
        MOV     r9, r12 ; r9 working copy of the max index size (right aligned)
30 ; Loop3
        ORR     r14, r11, r8, LSL r13 ; factor in the way number and cache number into r14
        ORR     r14, r14, r9, LSL r10 ; factor in the index number
        DCISW   r14 ; Invalidate
        SUBS    r9, r9, #1 ; decrement the index
        BGE     %BT30
        SUBS    r8, r8, #1 ; decrement the way number
        BGE     %BT20
        DSB                ; Cortex-A7 errata 814220: DSB required when changing cache levels when using set/way operations. This also counts as our end-of-maintenance DSB.
        MRC     p15, 1, r8, c0, c0, 1
40 ; Skip
        ADD     r11, r11, #2
        AND     r14, r8, #&07000000
        CMP     r14, r11, LSL #23
        BGT     %BT10

50 ; Finished
        ; Wait for clean to complete
        MOV     r8, #0
        MCR     p15, 0, r8, c7, c5, 0           ; invalidate instruction cache
        MCR     p15, 0, r8, c8, c7, 0           ; invalidate TLBs
        MCR     p15, 0, r8, c7, c5, 6           ; invalidate branch target predictor
        myDSB   ,r8,,y                          ; Wait for completion
        myISB   ,r8,,y
        ; All caches clean; switch back to SVC, then recover the stored PSR from ip (although we can be fairly certain we started in SVC anyway)
        MSR     CPSR_c, #F32_bit+I32_bit+SVC32_mode
        MSR     CPSR_cxsf, ip
        MOV     pc, lr
80 ; ARMv6 case
        MCR     ARM_config_cp,0,r9,ARMv4_cache_reg,C7 ; ARMv3-ARMv6 I+D cache flush
        B       %BT50
 ] ; MEMM_Type = "VMSAv6"

; int PhysAddrToPageNo(uint64_t addr)
;
; Converts a physical address to the page number of the page containing it.
; Returns -1 if address is not in RAM.

PhysAddrToPageNo
        ; Convert address to 4K addressing
        MOV     a1, a1, LSR #12
        ORR     a1, a1, a2, LSL #20
        MOV     a4, #0
        LDR     ip, =ZeroPage + PhysRamTable
10      LDMIA   ip!, {a2, a3}                   ; get phys addr, size
        MOVS    a3, a3, LSR #12                 ; end of list? (size=0)
        BEQ     %FT90                           ;   then it ain't RAM
        SUB     a2, a1, a2                      ; a2 = amount into this bank
        CMP     a2, a3                          ; if more than size
        ADDHS   a4, a4, a3                      ;   increase counter by size of bank
        BHS     %BT10                           ;   and move to next
        ADD     a1, a4, a2                      ; add offset to counter
        MOV     pc, lr

90      MOV     a1, #-1
        MOV     pc, lr


; A routine to construct the soft CAM from the page tables. This is used
; after a soft reset, and also on a hard reset as it's an easy way of
; clearing up after the recursive page table allocaton.

        ROUT
ConstructCAMfromPageTables
        Push    "v1-v8, lr"
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #MaxCamEntry]
        LDR     v1, [a1, #CamEntriesPointer]    ; v1 -> CAM (for whole routine)
        ADD     a2, a2, #1
        ADD     a2, v1, a2, LSL #CAM_EntrySizeLog2

        LDR     a3, =DuffEntry                  ; Clear the whole CAM, from
        MOV     a4, #AreaFlags_Duff             ; the top down.
        ASSERT  CAM_LogAddr=0
        ASSERT  CAM_PageFlags=4
        ASSERT  CAM_PMP=8
        ASSERT  CAM_PMPIndex=12
        ASSERT  CAM_EntrySize=16
        MOV     v2, #0
        MOV     v3, #-1
10      STMDB   a2!, {a3, a4, v2, v3}
        CMP     a2, v1
        BHI     %BT10

        MOV     v2, #0                          ; v2 = logical address
30      MOV     a1, v2
        PTOp    LoadAndDecodeL1Entry            ; a1,a2 = phys addr, a3 = page flags/type, a4 = page size (bytes)
        CMP     a3, #-2                         ; Only care about page table pointers
        BEQ     %FT39
        ADDS    v2, v2, a4
        BCC     %BT30
        Pull    "v1-v8, pc"

39      ADD     v7, v2, a4
40      MOV     a1, v2
        PTOp    LoadAndDecodeL2Entry            ; a1,a2 = phys addr, a3 = flags (-1 if fault), a4 = page size (bytes)
        CMP     a3, #-1                         ; move to next page if fault
        BEQ     %FT80
        SUBS    a4, a4, #4096                   ; large pages get bits 12-15 from the virtual address
        ANDNE   lr, v2, a4
        ORR     v6, a3, #PageFlags_Unavailable
        ORRNE   a1, a1, lr
        BL      PhysAddrToPageNo                ; -1 if unknown page
        ADDS    a1, v1, a1, LSL #CAM_EntrySizeLog2 ; a1 -> CAM entry
        ASSERT  CAM_LogAddr=0
        ASSERT  CAM_PageFlags=4
        STMCCIA a1, {v2, v6}                    ; store logical address, PPL

80      ADDS    v2, v2, #&00001000              ; always advance in 4K units here
        TEQ     v2, v7
        BNE     %BT40                           ; more to do from this L1 entry
        BCC     %BT30                           ; logical address not wrapped yet

        Pull    "v1-v8, pc"



; Allocate a physical page from DRAM
;
; On entry:
;    v1 -> current entry in PhysRamTable
;    v2 -> end of last used physical page (page units)
; On exit:
;    a1 -> next free page (assumed 32bit address)
;    v1, v2 updated
;
; No out of memory check...

Init_ClaimPhysicalPage
        MOV     a1, v2
        LDMIA   v1, {a2, a3}
        ADD     a2, a2, a3, LSR #12             ; a2 = end of this bank
        CMP     v2, a2                          ; advance v2 to next bank if
        LDRHS   a1, [v1, #8]!                   ; this bank is fully used
        ADD     v2, a1, #1
        MOV     a1, a1, LSL #12                 ; Convert to byte address
        MOV     pc, lr

Count32bitPages ROUT
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #MaxCamEntry]
      [ LongDesc
        ; ~64bit RAM addresses supported, examine PhysRamTable to determine
        ; the last page number with a 32bit address
        Entry
        MOV     a3, #-1
        ADD     a4, a1, #PhysRamTable
10
        LDMIA   a4!, {ip, lr}
        CMP     ip, #1:SHL:20                   ; Address below 4G?
        ADDLO   a3, a3, lr, LSR #12             ; Count it up
        CMPLO   a3, a2                          ; Don't overrun the table
        BLO     %BT10
        STR     a3, [a1, #MaxCamEntry32]
        ; Update ProcessorFlags
        CMP     a2, a3
        LDRNE   a2, [a1, #ProcessorFlags]
        ORRNE   a2, a2, #CPUFlag_HighRAM
        STRNE   a2, [a1, #ProcessorFlags]
        EXIT
      |
        ; No 64bit support, so all pages must have 32bit addresses
        STR     a2, [a1, #MaxCamEntry32]
        MOV     pc, lr
      ]

        ; in:  ip = ZeroPage
        ; out: v3 = PhysIllegalMask
DeterminePhysIllegalMask
        Push    "lr"
        MOV     v3, #&FFFFFFFF         ; By default any upper bits are illegal
        ARM_read_ID lr
        AND     lr, lr, #&F :SHL: 16
        CMP     lr, #ARMvF :SHL: 16    ; Check that feature registers are implemented
        BNE     %FT01                  ; Some ARMv5 chips supported supersections, but let's not worry about them
        MRC     p15, 0, lr, c0, c1, 7  ; ID_MMFR3
        TST     lr, #&F :SHL: 28
        MOVEQ   v3, #&FFFFFF00
01      STR     v3, [ip, #PhysIllegalMask]
        Pull    "pc"


; void Init_PageTablesChanged(void)
;
; A TLB+cache invalidation that works on all known ARMs. Invalidate all I+D TLB is the _only_ TLB
; op that works on ARM720T, ARM920T and SA110. Ditto invalidate all I+D cache.
;
; DOES NOT CLEAN THE DATA CACHE. This is a helpful simplification, but requires that don't use
; this routine after we've started using normal RAM.
;
Init_PageTablesChanged
        MOV     a3, lr
        BL      Init_ARMarch
        MOV     ip, #0
        BNE     %FT01
        MCREQ   ARM_config_cp,0,ip,ARMv3_TLBflush_reg,C0
        B       %FT02
01      MCRNE   ARM_config_cp,0,ip,ARMv4_TLB_reg,C7
02
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMvF
        ADREQ   lr, %FT01
        BEQ     HAL_InvalidateCache_ARMvF
        MCRNE   ARM_config_cp,0,ip,ARMv4_cache_reg,C7           ; works on ARMv3
01
 |
        MCR     ARM_config_cp,0,ip,ARMv4_cache_reg,C7           ; works on ARMv3
 ]
        MOV     pc, a3


;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;
;       ClearWkspRAM - Routine to clear "all" workspace
;
; We have to avoid anything between InitUsedStart and InitUsedEnd - i.e.
; the page tables, HAL workspace, etc.
;
; Note that zero page workspace isn't included in InitUsedStart/InitUsedEnd.
; Sensitive areas of it (e.g. PhysRamTable, IRQ vector) are skipped via the
; help of RamSkipTable
;
; The bulk of RAM is cleared during the keyboard scan (ClearFreePoolSection).

;
; out:  r4-r11, r13 preserved
;

ClearWkspRAM ROUT
        MSR     CPSR_c, #F32_bit+FIQ32_mode             ; get some extra registers
        MOV     r8, #0
        MOV     r9, #0
        MOV     r10, #0
        MOV     r11, #0
        MOV     r12, #0
        MOV     r13, #0
        MOV     r14, #0
        MSR     CPSR_c, #F32_bit+SVC32_mode

        LDR     r0,=ZeroPage+InitClearRamWs             ;we can preserve r4-r11,lr in one of the skipped regions
        STMIA   r0,{r4-r11,lr}

        DebugTX "ClearWkspRAM"

        ; Start off by clearing zero page + scratch space, as these:
        ; (a) are already mapped in and
        ; (b) may require the use of the skip table
        LDR     r0, =ZeroPage
        ADD     r1, r0, #16*1024
        ADR     r6, RamSkipTable
        MSR     CPSR_c, #F32_bit+FIQ32_mode             ; switch to our bank o'zeros
        LDR     r5, [r6], #4                            ; load first skip addr
10
        TEQ     r0, r1
        TEQNE   r0, r5
        STMNEIA r0!, {r8-r11}
        BNE     %BT10
        TEQ     r0, r1
        BEQ     %FT20
        LDR     r5, [r6], #4                            ; load skip amount
        ADD     r0, r0, r5                              ; and skip it
        LDR     r5, [r6], #4                            ; load next skip addr
        B       %BT10
20
        LDR     r0, =ScratchSpace
        ADD     r1, r0, #ScratchSpaceSize
30
        TEQ     r0, r1
        STMNEIA r0!, {r8-r11}
        STMNEIA r0!, {r8-r11}
        BNE     %BT30

        MSR     CPSR_c, #F32_bit+SVC32_mode

        LDR     r0, =ZeroPage+InitClearRamWs
        LDMIA   r0, {r4-r11,r14}                        ;restore

      [ {FALSE} ; NewReset sets this later
        LDR     r0, =ZeroPage+OsbyteVars + :INDEX: LastBREAK
        MOV     r1, #&80
        STRB    r1, [r0]                                ; flag the fact that RAM cleared
      ]

        MOV     pc, lr

        LTORG

        MACRO
        MakeSkipTable $addr, $size
        ASSERT  ($addr :AND: 15) = 0
        ASSERT  ($size :AND: 15) = 0
        ASSERT  ($addr-ZeroPage) < 16*1024
        &       $addr, $size
        MEND

        MACRO
        EndSkipTables
        &       -1
        MEND

RamSkipTable
        MakeSkipTable   ZeroPage, InitWsEnd
        MakeSkipTable   ZeroPage+SkippedTables, SkippedTablesEnd-SkippedTables
        EndSkipTables

;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;
;       ClearFreePoolSection - Routine to clear a section of the free pool
;
; During keyboard scanning we soak up slack time clearing the bulk of RAM
; by picking a section of the free pool, mapping it in, clearing & flushing.

;
; In:   r0 = CAM entry to continue from
; Out:  r0 = updated
;

ClearFreePoolSection ROUT
        Push    "r1-r3, lr"
        LDR     r3, =ZeroPage
        LDR     r1, [r3, #MaxCamEntry]
        LDR     r2, =ZeroPage+FreePoolDANode
        CMP     r0, r1
        BHI     %FT30

        LDR     r3, [r3, #CamEntriesPointer]
        ADD     r1, r3, r1, LSL #CAM_EntrySizeLog2      ; top entry (inc)
        ADD     r3, r3, r0, LSL #CAM_EntrySizeLog2      ; starting entry
10
        LDR     r14, [r3, #CAM_PageFlags]
        TST     r14, #DynAreaFlags_PMP
        BEQ     %FT20

        LDR     r14, [r3, #CAM_PMP]
        TEQ     r14, r2
        BEQ     %FT40
20
        ADD     r3, r3, #CAM_EntrySize                  ; next
        CMP     r3, r1
        BLS     %BT10
30
        MOV     r0, #-1
        Pull    "r1-r3, pc"
40
        Push    "r0-r12"

        ; This is a PMP entry in the free pool
        LDR     r14, [r3, #CAM_PMPIndex]                ; list index
        LDR     r9, [r2, #DANode_PMP]                   ; PMP list base
        LDR     r3, [r9, r14, LSL #2]                   ; ppn
        BL      ppn_to_physical                         ; => r8,r9 = PA

      [ MEMM_Type = "ARM600"
        ; Map in this section, cacheable + bufferable to ensure burst writes
        ; are performed (StrongARM will only perform burst writes to CB areas)
        MOV     a1, #OSAP_None
      |
        ; Map in this section with default NCB cache policy. Making it cacheable
        ; is liable to slow things down significantly on some platforms (e.g.
        ; PL310 L2 cache)
        LDR     a1, =OSAP_None + DynAreaFlags_NotCacheable
      ]
        MOV     a2, r8
        MOV     a3, r9
        MOV     a4, #0
        PTOp    AccessPhysicalAddress

        MOV     r5, r8, LSR #20
        ORR     r5, r5, r9, LSL #12                     ; r5 = physical MB

        MOV     r4, #0                                  ; clear to this value
        MOV     r6, r4
        MOV     r7, r4
        MOV     r12, r4
45
        MOV     r8, r4
        MOV     r9, r4
        MOV     r10, r4
        MOV     r11, r4

        ; Fill that page
        ADD     r2, r0, #4096
50
        STMIA   r0!, {r4,r6-r12}
        STMIA   r0!, {r4,r6-r12}
        TEQ     r0, r2
        BNE     %BT50

        MOV     r10, r5                                 ; previous phys MB

        ; Step the CAM until there are no more pages in that section
        LDR     r1, [sp, #1*4]
        LDR     r2, [sp, #2*4]
        LDR     r11, [sp, #3*4]
        B       %FT65
60
        LDR     r14, [r11, #CAM_PageFlags]
        TST     r14, #DynAreaFlags_PMP
        BEQ     %FT65

        LDR     r14, [r11, #CAM_PMP]
        TEQ     r14, r2
        BEQ     %FT70
65
        ADD     r11, r11, #CAM_EntrySize                ; next
        CMP     r11, r1
        BLS     %BT60

        MOV     r14, #-1                                ; CAM top, no more
        B       %FT80
70

        ; Next PMP entry in the free pool
        LDR     r14, [r11, #CAM_PMPIndex]               ; list index
        LDR     r9, [r2, #DANode_PMP]                   ; PMP list base
        LDR     r3, [r9, r14, LSL #2]                   ; ppn
        BL      ppn_to_physical                         ; => r8,r9 = PA

        MOV     r5, r8, LSR #20
        ORR     r5, r5, r9, LSL #12
        TEQ     r10, r5                                 ; same MB as previous?
        LDRNE   r14, =ZeroPage
        LDRNE   r14, [r14, #CamEntriesPointer]
        SUBNE   r14, r11, r14
        MOVNE   r14, r14, LSR #CAM_EntrySizeLog2        ; no, so compute continuation point
        SUBEQ   r0, r0, #4                              ; wind back to make sure we stay in the correct megabyte of PhysicalAccess
      [ NoARMT2
        MOVEQ   r0, r0, LSR #20
        ORREQ   r0, r0, r8, LSL #12
        MOVEQ   r0, r0, ROR #12
      |
        BFIEQ   r0, r8, #0, #20
      ]
        STREQ   r11, [sp, #3*4]
        BEQ     %BT45                                   ; yes, so clear it
80
        STR     r14, [sp, #0*4]                         ; return value for continuation

 [ MEMM_Type = "ARM600" ; VMSAv6 maps as non-cacheable, so no flush required
        ; Make page uncacheable so the following is safe
        MOV     r4, r0

        LDR     r0, =OSAP_None+DynAreaFlags_NotCacheable
        MOV     r1, r10, LSL #20
        MOV     r2, r10, LSR #12
        MOV     r3, #0
        PTOp    AccessPhysicalAddress

        MOV     r0, r4

        ; Clean & invalidate the cache before the 1MB window closes
      [ CacheCleanerHack
        ; StrongARM requires special clean code, because we haven't mapped in
        ; DCacheCleanAddress yet. Cheat and only perform a clean, not full
        ; clean + invalidate (should be safe as we've only been writing)
        ARM_read_ID r2
        AND     r2, r2, #&F000
        CMP     r2, #&A000
        BNE     %FT90
85
        SUB     r0, r0, #32                             ; rewind 1 cache line
        ARMA_clean_DCentry r0
        MOVS    r1, r0, LSL #12                         ; start of the MB?
        BNE     %BT85
        B       %FT91
90
      ]
        ARMop Cache_CleanInvalidateAll
 ]
91
        MOV     a1, #L1_Fault
        PTOp    ReleasePhysicalAddress                  ; reset to default

        Pull    "r0-r12"

        Pull    "r1-r3, pc"

InitProcVecs
        BKPT    &C000                                   ; Reset
        BKPT    &C004                                   ; Undefined Instruction
        BKPT    &C008                                   ; SWI
        BKPT    &C00C                                   ; Prefetch Abort
        SUBS    pc, lr, #4                              ; ignore data aborts
        BKPT    &C014                                   ; Address Exception
        LDR     pc, InitProcVecs + InitIRQHandler       ; IRQ
        BKPT    &C01C                                   ; FIQ
InitProcVec_FIQ
        DCD     0
InitProcVecsEnd

; RISCOS_MapInIO flags
MapInFlag_DoublyMapped * 1:SHL:20
MapInFlag_APSpecified * 1:SHL:21

      [ LongDesc :LAND: ShortDesc
; Dispatch for pagetable-specific MapInIO / MapInIO64 (mainly to handle calls
; from HAL via the OS header table)
RISCOS_MapInIO
        PTWhich  ip
        BNE      RISCOS_MapInIO_LongDesc
        B        RISCOS_MapInIO_ShortDesc

RISCOS_MapInIO64
        PTWhich  ip
        BNE      RISCOS_MapInIO64_LongDesc
        B        RISCOS_MapInIO64_ShortDesc
      ELIF LongDesc
RISCOS_MapInIO * RISCOS_MapInIO_LongDesc
RISCOS_MapInIO64 * RISCOS_MapInIO64_LongDesc
      |
RISCOS_MapInIO * RISCOS_MapInIO_ShortDesc
RISCOS_MapInIO64 * RISCOS_MapInIO64_ShortDesc
      ]

; void RISCOS_AddDevice(unsigned int flags, struct device *d)
RISCOS_AddDevice
        ADDS    a1, a2, #0      ; also clears V
        B       HardwareDeviceAdd_Common

; uint64_t RISCOS_LogToPhys(const void *log)
; Returns -1 for invalid addresses
; Also returns page flags in r2 (for AbortTrap) and mapping size/alignment
; in r3 (for OS_Memory 65)
        EXPORT  RISCOS_LogToPhys
RISCOS_LogToPhys ROUT
        Push    "lr"
        MOV     r12, a1
        MOV     r0, a1, LSR #20
        MOV     r0, r0, LSL #20
        PTOp    LoadAndDecodeL1Entry
        CMP     r2, #-2
        BNE     %FT50
        ; L2 page table pointer. Because we've already checked L1PT, there
        ; shouldn't be any need for us to check if the relevant logical mapping
        ; of L2PT exists (like logical_to_physical does).
        MOV     r0, r12, LSR #12
        MOV     r0, r0, LSL #12
      [ AMB_LazyMapIn
        BL      AMB_MakeHonestLA
      ]
        PTOp    LoadAndDecodeL2Entry
        CMP     r2, #-2
50
        MOVHI   r0, #-1                ; Translation fault
        MOVHI   r1, #-1
        Pull    "pc",HI
        ; Valid mapping, add in the low address bits
        SUB     lr, r3, #1
        AND     r12, r12, lr
        ORR     r0, r0, r12
        Pull    "pc"

; int RISCOS_IICOpV(IICDesc *descs, uint32_t ndesc_and_bus)
RISCOS_IICOpV ROUT
        Push    "lr"
        BL      IIC_OpV
        MOVVC   a1, #IICStatus_Completed
        Pull    "pc", VC
        ; Map from RISC OS error numbers to abstract IICStatus return values
        LDR     a1, [a1]
        LDR     lr, =ErrorNumber_IIC_NoAcknowledge
        SUB     a1, a1, lr              ; 0/1/2 = NoAck/Error/Busy
        CMP     a1, #3
        MOVCS   a1, #3                  ; 3+ => unknown, either way it's an Error
        ADR     lr, %FT10
        LDRB    a1, [lr, a1]
        Pull    "pc"
10
        ASSERT    (ErrorNumber_IIC_Error - ErrorNumber_IIC_NoAcknowledge) = 1
        ASSERT    (ErrorNumber_IIC_Busy - ErrorNumber_IIC_NoAcknowledge) = 2
        DCB       IICStatus_NoACK, IICStatus_Error, IICStatus_Busy, IICStatus_Error
        ALIGN

SetUpHALEntryTable ROUT
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #HAL_Descriptor]
        LDR     a3, [a1, #HAL_Workspace]
        LDR     a4, [a2, #HALDesc_Entries]
        LDR     ip, [a2, #HALDesc_NumEntries]
        ADD     a4, a2, a4                              ; a4 -> entry table
        MOV     a2, a4                                  ; a2 -> entry table (increments)
10      SUBS    ip, ip, #1                              ; decrement counter
        LDRCS   a1, [a2], #4
        BCC     %FT20
        TEQ     a1, #0
        ADREQ   a1, NullHALEntry
        ADDNE   a1, a4, a1                              ; convert offset to absolute
        STR     a1, [a3, #-4]!                          ; store backwards below HAL workspace
        B       %BT10
20      LDR     a1, =ZeroPage                           ; pad table with NullHALEntries
        LDR     a4, =HALWorkspace                       ; in case where HAL didn't supply enough
        ADR     a1, NullHALEntry
30      CMP     a3, a4
        STRHI   a1, [a3, #-4]!
        BHI     %BT30
        MOV     pc, lr


NullHALEntry
        MOV     pc, lr

; Can freely corrupt r10-r12 (v7,v8,ip).
HardwareSWI
        AND     ip, v5, #&FF

        CMP     ip, #OSHW_LookupRoutine
        ASSERT  OSHW_CallHAL < OSHW_LookupRoutine
        BLO     HardwareCallHAL
        BEQ     HardwareLookupRoutine

        CMP     ip, #OSHW_DeviceRemove
        ASSERT  OSHW_DeviceAdd < OSHW_DeviceRemove
        BLO     HardwareDeviceAdd
        BEQ     HardwareDeviceRemove

        CMP     ip, #OSHW_DeviceEnumerateChrono
        ASSERT  OSHW_DeviceEnumerate < OSHW_DeviceEnumerateChrono
        ASSERT  OSHW_DeviceEnumerateChrono < OSHW_MaxSubreason
        BLO     HardwareDeviceEnumerate
        BEQ     HardwareDeviceEnumerateChrono
        BHI     HardwareBadReason

HardwareCallHAL
        Push    "v1-v4,sb,lr"
        ADD     v8, sb, #1                              ; v8 = entry no + 1
        LDR     ip, =ZeroPage
        LDR     v7, [ip, #HAL_Descriptor]
        AddressHAL ip                                   ; sb set up
        LDR     v7, [v7, #HALDesc_NumEntries]           ; v7 = number of entries
        CMP     v8, v7                                  ; entryno + 1 must be <= number of entries
        BHI     HardwareBadEntry2
        LDR     ip, [sb, -v8, LSL #2]
        ADR     v7, NullHALEntry
        TEQ     ip, v7
        BEQ     HardwareBadEntry2
      [ NoARMv5
        MOV     lr, pc
        MOV     pc, ip
      |
        BLX     ip
      ]
        ADD     sp, sp, #4*4
        Pull    "sb,lr"
        ExitSWIHandler

HardwareLookupRoutine
        ADD     v8, sb, #1                              ; v8 = entry no + 1
        LDR     ip, =ZeroPage
        LDR     v7, [ip, #HAL_Descriptor]
        AddressHAL ip
        LDR     v7, [v7, #HALDesc_NumEntries]
        CMP     v8, v7                                  ; entryno + 1 must be <= number of entries
        BHI     HardwareBadEntry
        LDR     a1, [sb, -v8, LSL #2]
        ADR     v7, NullHALEntry
        TEQ     a1, v7
        BEQ     HardwareBadEntry
        MOV     a2, sb
        ExitSWIHandler

HardwareDeviceAdd
        Push    "r1-r3,lr"
        BL      HardwareDeviceAdd_Common
        Pull    "r1-r3,lr"
        B       SLVK_TestV

HardwareDeviceRemove
        Push    "r1-r3,lr"
        BL      HardwareDeviceRemove_Common
        Pull    "r1-r3,lr"
        B       SLVK_TestV

HardwareDeviceAdd_Common
        Entry
        BL      HardwareDeviceRemove_Common             ; first try to remove any device already at the same address
        EXIT    VS
        LDR     lr, =ZeroPage
        LDR     r1, [lr, #DeviceCount]
        LDR     r2, [lr, #DeviceTable]
        TEQ     r2, #0
        BEQ     %FT80
        ADD     r1, r1, #1                              ; increment DeviceCount
        LDR     lr, [r2, #-4]                           ; word before heap block is length including length word
        TEQ     r1, lr, LSR #2                          ; block already full?
        BEQ     %FT81
        LDR     lr, =ZeroPage
10      STR     r1, [lr, #DeviceCount]
        ADD     lr, r2, r1, LSL #2
        SUB     lr, lr, #4
11      LDR     r1, [lr, #-4]!                          ; copy existing devices up, so new ones get enumerated first
        STR     r1, [lr, #4]
        CMP     lr, r2
        BHI     %BT11
        STR     r0, [r2]
        MOV     r2, r0
        MOV     r1, #Service_Hardware
        MOV     r0, #0
        BL      Issue_Service
        ADDS    r0, r2, #0                              ; exit with V clear
        EXIT

80      ; Claim a system heap block for the device table
        Push    "r0"
        MOV     r3, #16
        BL      ClaimSysHeapNode
        ADDVS   sp, sp, #4
        EXIT    VS
        Pull    "r0"
        LDR     lr, =ZeroPage
        MOV     r1, #1
        STR     r2, [lr, #DeviceTable]
        B       %BT10

81      ; Extend the system heap block
        Push    "r0"
        MOV     r0, #HeapReason_ExtendBlock
        MOV     r3, #16
        BL      DoSysHeapOpWithExtension
        ADDVS   sp, sp, #4
        EXIT    VS
        Pull    "r0"
        LDR     lr, =ZeroPage
        LDR     r1, [lr, #DeviceCount]
        STR     r2, [lr, #DeviceTable]
        ADD     r1, r1, #1
        B       %BT10

HardwareDeviceRemove_Common ROUT
        Entry   "r4"
        LDR     lr, =ZeroPage
        LDR     r3, [lr, #DeviceCount]
        LDR     r4, [lr, #DeviceTable]
        TEQ     r3, #0
        EXIT    EQ                                      ; no devices registered
10      LDR     r2, [r4], #4
        SUBS    r3, r3, #1
        TEQNE   r2, r0
        BNE     %BT10
        TEQ     r2, r0
        EXIT    NE                                      ; this device not registered
        MOV     r0, #1
        MOV     r1, #Service_Hardware
        BL      Issue_Service
        CMP     r1, #0                                  ; if service call claimed
        CMPEQ   r1, #1:SHL:31                           ; then set V (r0 already points to error block)
        EXIT    VS                                      ; and exit
        ; Search for device again - we may have been re-entered
        MOV     r0, r2
        LDR     lr, =ZeroPage
        LDR     r3, [lr, #DeviceCount]
        LDR     r4, [lr, #DeviceTable]
        TEQ     r3, #0
        EXIT    EQ                                      ; no devices registered
20      LDR     r2, [r4], #4
        SUBS    r3, r3, #1
        TEQNE   r2, r0
        BNE     %BT20
        TEQ     r2, r0
        EXIT    NE                                      ; this device not registered
        SUBS    r3, r3, #1
30      LDRCS   r2, [r4], #4                            ; copy down remaining devices
        STRCS   r2, [r4, #-8]
        SUBCSS  r3, r3, #1
        BCS     %BT30
        LDR     lr, =ZeroPage
        LDR     r3, [lr, #DeviceCount]
        SUB     r3, r3, #1
        STR     r3, [lr, #DeviceCount]
        EXIT

HardwareDeviceEnumerate
        Push    "r3-r4,lr"
        LDR     lr, =ZeroPage
        LDR     r2, [lr, #DeviceCount]
        LDR     r3, [lr, #DeviceTable]
        SUBS    r4, r2, r1
        MOVLS   r1, #-1
        BLS     %FT90                                   ; if r1 is out of range then exit
        ADD     r3, r3, r1, LSL #2
10      ADD     r1, r1, #1
        LDR     r2, [r3], #4
        LDR     lr, [r2, #HALDevice_Type]
        EOR     lr, lr, r0
        MOVS    lr, lr, LSL #16                         ; EQ if types match
        SUBNES  r4, r4, #1
        BNE     %BT10
        TEQ     lr, #0
        MOVNE   r1, #-1
        BNE     %FT90
        LDR     lr, [r2, #HALDevice_Version]
        MOV     lr, lr, LSR #16
        CMP     lr, r0, LSR #16                         ; newer than our client understands?
        BLS     %FT90
        SUBS    r4, r4, #1
        BHI     %BT10
        MOV     r1, #-1
90
        Pull    "r3-r4,lr"
        ExitSWIHandler

HardwareDeviceEnumerateChrono
        Push    "r3-r4,lr"
        LDR     lr, =ZeroPage
        LDR     r2, [lr, #DeviceCount]
        LDR     r3, [lr, #DeviceTable]
        SUBS    r4, r2, r1
        MOVLS   r1, #-1
        BLS     %FT90                                   ; if r1 is out of range then exit
        ADD     r3, r3, r4, LSL #2
10      ADD     r1, r1, #1
        LDR     r2, [r3, #-4]!
        LDR     lr, [r2, #HALDevice_Type]
        EOR     lr, lr, r0
        MOVS    lr, lr, LSL #16                         ; EQ if types match
        SUBNES  r4, r4, #1
        BNE     %BT10
        TEQ     lr, #0
        MOVNE   r1, #-1
        BNE     %FT90
        LDR     lr, [r2, #HALDevice_Version]
        MOV     lr, lr, LSR #16
        CMP     lr, r0, LSR #16                         ; newer than our client understands?
        BLS     %FT90
        SUBS    r4, r4, #1
        BHI     %BT10
        MOV     r1, #-1
90
        Pull    "r3-r4,lr"
        ExitSWIHandler

HardwareBadReason
        ADR     r0, ErrorBlock_HardwareBadReason
 [ International
        Push    "lr"
        BL      TranslateError
        Pull    "lr"
 ]
        B       SLVK_SetV

HardwareBadEntry2
        ADD     sp, sp, #4*4
        Pull    "sb,lr"
HardwareBadEntry
        ADR     r0, ErrorBlock_HardwareBadEntry
 [ International
        Push    "lr"
        BL      TranslateError
        Pull    "lr"
 ]
        B       SLVK_SetV

        MakeErrorBlock HardwareBadReason
        MakeErrorBlock HardwareBadEntry

 [ DebugTerminal
DebugTerminal_Rdch
        Push    "a2-a4,sb,ip"
        MOV     sb, ip
20
        WritePSRc SVC_mode, r1
        CallHAL HAL_DebugRX
        CMP     a1, #27
        BNE     %FT25
        LDR     a2, =ZeroPage + OsbyteVars + :INDEX: RS423mode
        LDRB    a2, [a2]
        TEQ     a2, #0                  ; is RS423 raw data,or keyb emulator?
        BNE     %FT25
        LDR     a2, =ZeroPage
        LDRB    a1, [a2, #ESC_Status]
        ORR     a1, a1, #&40
        STRB    a1, [a2, #ESC_Status]   ; mark escape flag
        MOV     a1, #27
        SEC                             ; tell caller to look carefully at R0
        Pull    "a2-a4,sb,ip,pc"
25
        CMP     a1, #-1
        Pull    "a2-a4,sb,ip,pc",NE     ; claim it
        LDR     R0, =ZeroPage
        LDRB    R14, [R0, #CallBack_Flag]
        TST     R14, #CBack_VectorReq
        BLNE    process_callbacks_disableIRQ
        B       %BT20


DebugTerminal_Wrch
        Push    "a1-a4,sb,ip,lr"
        MOV     sb, ip
        CallHAL HAL_DebugTX
        Pull    "a1-a4,sb,ip,pc"        ; don't claim it
 ]


Reset_IRQ_Handler
        SUB     lr, lr, #4
        Push    "a1-a4,v1-v2,sb,ip,lr"
        MRS     a1, SPSR
        MRS     a2, CPSR
        ORR     a3, a2, #SVC32_mode
        MSR     CPSR_c, a3
        Push    "a1-a2,lr"

        ; If it's not an IIC interrupt, mute it
        LDR     v2, =ZeroPage
        AddressHAL v2
        CallHAL HAL_IRQSource
        ADD     v1, v2, #IICBus_Base
        MOV     ip, #0
10
        LDR     a2, [v1, #IICBus_Type]
        TST     a2, #IICFlag_Background
        BEQ     %FT20
        LDR     a2, [v1, #IICBus_Device]
        CMP     a2, a1
        ADREQ   lr, Reset_IRQ_Exit
        BEQ     IICIRQ
20
        ADD     ip, ip, #1
        ADD     v1, v1, #IICBus_Size
        CMP     ip, #IICBus_Count
        BNE     %BT10

        CallHAL HAL_IRQDisable ; Stop the rogue device from killing us completely

Reset_IRQ_Exit
        MyCLREX a1, a2
        Pull    "a1-a2,lr"
        MSR     CPSR_c, a2
        MSR     SPSR_cxsf, a1
        Pull    "a1-a4,v1-v2,sb,ip,pc",,^

 [ DebugHALTX
DebugHALPrint
        Push    "a1-a4,v1,v2,sb,ip"
        MRS     v2, CPSR
        AddressHAL
        MOV     v1, lr
10      LDRB    a1, [v1], #1
        TEQ     a1, #0
        BEQ     %FT20
        CallHAL HAL_DebugTX
        B       %BT10
20      MOV     a1, #13
;        CallHAL HAL_DebugTX
        MOV     a1, #10
;        CallHAL HAL_DebugTX
        ADD     v1, v1, #3
        BIC     lr, v1, #3
        MSR     CPSR_sf, v2
        Pull    "a1-a4,v1,v2,sb,ip"
        MOV     pc, lr
 ]


 [ DebugHALTX
DebugHALPrintReg ; Output number on top of stack to the serial port
        Push    "a1-a4,v1-v4,sb,ip,lr"   ; this is 11 regs
        MRS     v1, CPSR
        LDR     v2, [sp,#11*4]           ; find TOS value on stack
        ADR     v3, hextab
        MOV     v4, #8
05
       AddressHAL
10      LDRB    a1, [v3, v2, LSR #28]
       CallHAL  HAL_DebugTX
        MOV     v2, v2, LSL #4
        SUBS    v4, v4, #1
        BNE     %BT10
        MOV     a1, #13
       CallHAL  HAL_DebugTX
        MOV     a1, #10
       CallHAL  HAL_DebugTX

        MSR     CPSR_sf, v1
        Pull    "a1-a4,v1-v4,sb,ip,lr"
        ADD     sp, sp, #4
        MOV     pc, lr

hextab  DCB "0123456789abcdef"


 ]
;
;
; [ DebugHALTX
;HALDebugHexTX
;       stmfd    r13!, {r0-r3,sb,ip,lr}
;       AddressHAL
;       b        jbdt1
;HALDebugHexTX2
;       stmfd    r13!, {r0-r3,sb,ip,lr}
;       AddressHAL
;       mov      r0,r0,lsl #16
;       b        jbdt2
;HALDebugHexTX4
;       stmfd    r13!, {r0-r3,sb,ip,lr}
;       AddressHAL
;       mov      r0,r0,ror #24          ; hi byte
;       bl       jbdtxh
;       mov      r0,r0,ror #24
;       bl       jbdtxh
;jbdt2
;       mov      r0,r0,ror #24
;       bl       jbdtxh
;       mov      r0,r0,ror #24
;jbdt1
;       bl       jbdtxh
;       mov      r0,#' '
;       CallHAL  HAL_DebugTX
;       ldmfd    r13!, {r0-r3,sb,ip,pc}
;
;jbdtxh stmfd    r13!,{a1,v1,lr}        ; print byte as hex. corrupts a2-a4, ip, assumes sb already AddressHAL'd
;       and      v1,a1,#&f              ; get low nibble
;       and      a1,a1,#&f0             ; get hi nibble
;       mov      a1,a1,lsr #4           ; shift to low nibble
;       cmp      a1,#&9                 ; 9?
;       addle    a1,a1,#&30
;       addgt    a1,a1,#&37             ; convert letter if needed
;       CallHAL  HAL_DebugTX
;       cmp      v1,#9
;       addle    a1,v1,#&30
;       addgt    a1,v1,#&37
;       CallHAL  HAL_DebugTX
;       ldmfd    r13!,{a1,v1,pc}
; ]
;
        END