; Copyright 2000 Pace Micro Technology plc
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
        GBLL    MinorL2PThack
MinorL2PThack SETL {TRUE}

; Fixed page allocation is as follows

                        ^       0
DRAMOffset_FirstFixed   #       0
DRAMOffset_ScratchSpace #       16*1024
DRAMOffset_PageZero     #       16*1024
DRAMOffset_PageTables   #       0               ; Base of the page tables used during init; just a useful symbol we can use for relative offsets
 [ LongDesc
DRAMOffset_LL2PT        #       16*1024         ; Each page needs to be page-aligned
DRAMOffset_LL1PT        #       4096            ; Only needs to be 32 byte aligned
 |
DRAMOffset_L1PT         #       16*1024         ; L1PT must be 16K-aligned
 ]
DRAMOffset_LastFixed    #       0
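
; The allocations above give (offsets from the base of the chosen DRAM block):
;   &0000  ScratchSpace  (16K)
;   &4000  PageZero      (16K)
;   &8000  PageTables = LL2PT (16K), then LL1PT (4K)   - LongDesc builds
;   &8000  PageTables = L1PT  (16K)                    - short-descriptor builds
; so DRAMOffset_LastFixed is &D000 (LongDesc) or &C000 (short descriptors).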

;        IMPORT  Init_ARMarch
;        IMPORT  ARM_Analyse

 [ MEMM_Type = "VMSAv6"
mmuc_init_new
        ; MMUC initialisation flags for ARMv6/ARMv7
        ; This tries to leave the reserved/unpredictable bits untouched, while initialising everything else to what we want
                ; ARMv7MP (probably) wants SW. ARMv6 wants U+XP (which should both be fixed at 1 on ARMv7)
        DCD     MMUC_BEN+MMUC_SW+MMUC_U+MMUC_XP
                ; M+C+W+Z+I+L2 clear to keep MMU/caches off.
                ; A to keep alignment exceptions off (for now at least)
                ; B+EE clear for little-endian
                ; S+R+RR clear to match mmuc_init_old
                ; V+VE clear to keep processor vectors at &0
                ; FI clear to disable fast FIQs (interruptible LDM/STM)
                ; TRE+AFE clear for our VMSAv6 implementation
                ; TE clear for processor vectors to run in ARM mode
        DCD     MMUC_M+MMUC_A+MMUC_C+MMUC_W+MMUC_B+MMUC_S+MMUC_R+MMUC_Z+MMUC_I+MMUC_V+MMUC_RR+MMUC_FI+MMUC_VE+MMUC_EE+MMUC_L2+MMUC_TRE+MMUC_AFE+MMUC_TE
mmuc_init_old
        ; MMUC initialisation flags for ARMv5 and below, as per ARM600 MMU code
                ; Late abort (ARM6 only), 32-bit Data and Program space. No Write buffer (ARM920T
                ; spec says W bit should be set, but I reckon they're bluffing).
                ;
                ; The F bit's tricky. (1 => CPCLK=FCLK, 0 => CPCLK=FCLK/2). The only chip using it was the
                ; ARM700; it never really reached the customer, and it's always been programmed with
                ; CPCLK=FCLK. Therefore we'll keep it that way, and ignore the layering violation.
        DCD     MMUC_F+MMUC_L+MMUC_D+MMUC_P
                ; All of these bits should be off already, but just in case...
        DCD     MMUC_B+MMUC_W+MMUC_C+MMUC_A+MMUC_M+MMUC_RR+MMUC_V+MMUC_I+MMUC_Z+MMUC_R+MMUC_S
 ]
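
; The (set, clear) word pairs above are applied to the current control
; register value by RISCOS_InitARM below. A minimal C sketch of the same
; operation (illustrative only - not part of the kernel sources):
;
;   #include <stdint.h>
;   uint32_t cr_init(uint32_t cr, const uint32_t init[2])
;   {
;       cr |= init[0];   /* first word: bits we require set */
;       cr &= ~init[1];  /* second word: bits we require clear */
;       return cr;
;   }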

; void RISCOS_InitARM(unsigned int flags)
;
RISCOS_InitARM
        MOV     a4, lr
        ; Check if we're architecture 3. If so, don't read the control register.
        BL      Init_ARMarch
        MOVEQ   a3, #0
        ARM_read_control a3, NE
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMv6
        CMPNE   a1, #ARMvF
        ADREQ   a2, mmuc_init_new
        ADRNE   a2, mmuc_init_old
        LDMIA   a2, {a2, lr}
        ORR     a3, a3, a2
        BIC     a3, a3, lr
 |
        ; Late abort (ARM6 only), 32-bit Data and Program space. No Write buffer (ARM920T
        ; spec says W bit should be set, but I reckon they're bluffing).
        ;
        ; The F bit's tricky. (1 => CPCLK=FCLK, 0 => CPCLK=FCLK/2). The only chip using it was the
        ; ARM700; it never really reached the customer, and it's always been programmed with
        ; CPCLK=FCLK. Therefore we'll keep it that way, and ignore the layering violation.
        ORR     a3, a3, #MMUC_F+MMUC_L+MMUC_D+MMUC_P
        ; All of these bits should be off already, but just in case...
        BIC     a3, a3, #MMUC_B+MMUC_W+MMUC_C+MMUC_A+MMUC_M
        BIC     a3, a3, #MMUC_RR+MMUC_V+MMUC_I+MMUC_Z+MMUC_R+MMUC_S
 ]

        ; Off we go.
        ARM_write_control a3
        MOV     a2, #0
 [ MEMM_Type = "VMSAv6"
        myISB   ,a2,,y ; Ensure the update went through
 ]

        ; In case it wasn't a hard reset
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMvF
        ; Assume that all ARMvF ARMs have multi-level caches and thus no single MCR op for invalidating all the caches
        ADREQ   lr, %FT01
        BEQ     HAL_InvalidateCache_ARMvF
        MCRNE   ARM_config_cp,0,a2,ARMv4_cache_reg,C7           ; invalidate I+D caches
01
 ]
        CMP     a1, #ARMv3
        BNE     %FT01
        MCREQ   ARM_config_cp,0,a2,ARMv3_TLBflush_reg,C0        ; flush TLBs
        B       %FT02
01      MCRNE   ARM_config_cp,0,a2,ARMv4_TLB_reg,C7             ; flush TLBs
02
 [ MEMM_Type = "VMSAv6"
        myDSB   ,a2,,y
        myISB   ,a2,,y
 ]

        ; We assume that ARMs with an I cache can have it enabled while the MMU is off.
        [ :LNOT:CacheOff
        ORRNE   a3, a3, #MMUC_I
        ARM_write_control a3, NE                                ; whoosh
 [ MEMM_Type = "VMSAv6"
        myISB   ,a2,,y ; Ensure the update went through
 ]
        ]

        ; Check if we are in a 26-bit mode.
        MRS     a2, CPSR
        ; Keep a soft copy of the CR in a banked register (R13_und)
        MSR     CPSR_c, #F32_bit+I32_bit+UND32_mode
        MOV     sp, a3
        ; Switch into SVC32 mode (we may have been in SVC26 before).
        MSR     CPSR_c, #F32_bit+I32_bit+SVC32_mode

        ; If we were in a 26-bit mode, the lr value given to us would have had PSR flags in.
        TST     a2, #2_11100
        MOVNE   pc, a4
        BICEQ   pc, a4, #ARM_CC_Mask

; void *RISCOS_AddRAM(unsigned int flags, uintptr_t start, uintptr_t end, uintptr_t sigbits, void *ref)
;   Entry:
;     flags   bit 0: video memory (currently only one block permitted)
;             bit 1: video memory is not suitable for general use
;             bit 2: memory can't be used for DMA (sound, video, or other)
;             bits 8-11: speed indicator (arbitrary, higher => faster)
;             bit 12: start, end, sigbits are shifted right 12 bits (for supporting large physical addresses)
;             other bits reserved (SBZ)
;     start   = start address of RAM (inclusive) (no alignment requirements)
;     end     = end address of RAM (exclusive) (no alignment requirements, but must be >= start)
;     sigbits = significant address bit mask (1 => this bit of addr decoded, 0 => this bit ignored)
;     ref     = reference handle (NULL for first call)
;
; The first registered block must be in the low 4GB, and blocks must not cross
; 4GB thresholds.
; The (unshifted) sigbits is assumed to be the same across all calls.
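;
; An illustrative calling sequence (addresses and sizes are hypothetical):
;
;   void *ref = NULL;
;   ref = RISCOS_AddRAM(0, 0x00000000, 0x10000000, 0xFFFFFFFF, ref); /* 256M DRAM */
;   ref = RISCOS_AddRAM(1, 0x40000000, 0x40800000, 0xFFFFFFFF, ref); /* 8M VRAM (flags bit 0) */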

; A table is built up at the head of the first block of memory.
; The table consists of (addr, size+flags) word pairs, terminated by a count
; of those pairs; ref points to that counter.
; The table stores addresses in terms of 4K pages, allowing us to
; theoretically support a 44-bit physical space.
; Twelve bits of flags are stored at the bottom of the size word.
; The remaining 20 bits are the length of the block in 4K pages, minus
; one (so a full 4GB region will have length &FFFFF).
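;
; As a sketch of the encoding (the helper names are made up):
;
;   uint32_t entry_addr(uint64_t phys)  { return (uint32_t)(phys >> 12); }
;   uint32_t entry_size(uint32_t pages, uint32_t flags)
;   {
;       return ((pages - 1) << 12) | (flags & 0xFFF);
;   }
;   /* e.g. a full 4GB block: entry_size(1<<20, f) == (0xFFFFF<<12) | f */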

        ROUT
RISCOS_AddRAM
        Entry   "v1-v4"
        LDR     v4, [sp, #5*4]          ; Get ref

        ; Round to pages. If we were extra sneaky we could skip this and chuck out incomplete
        ; pages after concatenation, but it would be a weird HAL that gave us pages split across
        ; calls.
        ;
        TST     a1, #OSAddRAM_LargeAddresses
        ADDEQ   a2, a2, #4096           ; round start address up
        SUBEQ   a2, a2, #1
        MOVEQ   a2, a2, LSR #12
        MOVEQ   a3, a3, LSR #12         ; round end address down
        MOVEQ   a4, a4, LSR #12
        ORREQ   a4, a4, #&FF000000
        ORREQ   a4, a4, #&00F00000
        BIC     a1, a1, #OSAddRAM_LargeAddresses ; Flag no longer relevant

      [ :LNOT: LongDesc
        ; Ignore any RAM above the 32-bit barrier if we've been built with
        ; long-descriptor support disabled (the HAL may not know our build settings)
        CMP     a2, #1:SHL:20
        BHS     %FT90
      ]

        CMP     a3, a2
        BLS     %FT90                   ; check we aren't now null

        CMP     v4, #0
        BEQ     %FT20


        ; We are not dealing with the first block since v4 != 0.  Make an attempt to merge this block
        ; with the previous block.
        LDMDB   v4, {v1, v2}            ; Get details of the previous block
        EOR     ip, v1, a2
        CMP     ip, #1:SHL:20
        BHS     %FT20                   ; Don't merge if they're in different 4GB blocks
        MOV     v3, v2, LSL #20         ; Isolate flags
        MOV     v2, v2, LSR #12         ; And get length in pages
        ADD     v2, v1, v2              ; Get the end address
        ADD     v2, v2, #1
        EOR     v2, v2, a2              ; Compare with the current block start address...
        TST     v2, a4                  ; ... but only check the decoded bits.
        EOR     v2, v2, a2              ; Restore the previous block end address.
        TEQEQ   v3, a1, LSL #20         ; And are the page flags the same?
        BNE     %FT10                   ; We can't merge it after the previous block

        ; v1 = previous start
        ; v2 = previous end
        ; The block is just after the previous block.  That means the start address is unchanged, but
        ; the length is increased.
        LDR     v2, [v4, #-4]           ; Reload length+flags
        SUB     a3, a3, a2              ; Find the length of the new block.
        ; a3 = length of block
        ADD     v2, v2, a3, LSL #12     ; Add it to the previous length.
        STR     v2, [v4, #-4]           ; Update the block size in memory.
        MOV     a1,v4
        EXIT

        ; The block is not just after the previous block, but it may be just before.  This may be the
        ; case if we are softloaded.
10      SUB     v1, v1, #1              ; Compare the address before the previous block start ...
        SUB     a3, a3, #1              ; ... with the address of the last page in this block ...
        EOR     v1, v1, a3
        TST     v1, a4                  ; ... but check only the decoded bits.
        ADD     a3, a3, #1              ; Restore the end address.
        TEQEQ   v3, a1, LSL #20         ; And are the page flags the same?
        BNE     %FT20                   ; Skip if we cannot merge the block.

        ; The block is just before the previous block.  The start address and length both change.
        LDMDB   v4, {v1, v2}            ; Get the previous block again
        SUB     a3, a3, a2              ; Calculate the current block size.
        SUB     v1, v1, a3              ; Subtract from the previous block start address.
        ADD     v2, v2, a3, LSL #12     ; Calculate the new length+flags
        STMDB   v4, {v1, v2}            ; Update the block info in memory.
        MOV     a1,v4
        EXIT

        ; We now have a region which does not merge with a previous region.  We move it up to the
        ; highest address we can in the hope that this block will merge with the next block.
20      SUB     a3, a3, a2              ; Calculate the block size
        SUB     a3, a3, #1
        ORR     a3, a3, a1, LSL #20     ; Put the flags in
        MOV     a3, a3, ROR #20         ; And rotate so they're at the bottom
        MVN     v1, a4                  ; Get the non-decoded address lines.
        ORR     a2, v1, a2              ; Set the non-decoded address bit in the start address.
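
        ; Worked example (page units, i.e. addresses shifted right by 12):
        ; with sigbits = &0FFFFFFF (address bits 28-31 undecoded), a4 =
        ; &FFF0FFFF and MVN gives &000F0000, so a block at physical page 0
        ; is stored with base page &000F0000. If RAM at physical page &100
        ; is added next, the merge check above EORs the stored end
        ; (&000F0100) with &100, giving &000F0000, which TST a4 masks to
        ; zero - so the blocks merge despite the differing undecoded bits.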

30      CMP     v4, #0                  ; If the workspace has not been allocated...
        MOVEQ   v4, a2, LSL #12         ; ... use this block.
        MOVEQ   v1, #0                  ; Initialise the counter.

        ; The block/fragment to be added is between a2 and a2+a3.
        LDRNE   v1, [v4]                ; Get the old counter if there was one.
        STMIA   v4!, {a2, a3}           ; Store address and size.
        ADD     v1, v1, #1              ; Increment the counter.
        STR     v1, [v4]                ; Store the counter.

90      MOV     a1,v4
        EXIT                            ; We're done with this block now.



; Subtractv1v2fromRAMtable
;
; On entry: v1 = base of memory area
;           v2 = size of memory area
;           a4 = RAM table handle (ie pointer to terminator word containing number of entries)
;
; On exit:  a1-a3 preserved
;           a4 and RAM table updated
;           other registers corrupted
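;
; The overlap (if any) resolves to one of four cases handled below:
;   area covers the whole block   -> remove_block (delete the entry)
;   area at the bottom of block   -> raise the block's base
;   area at the top of block      -> shrink the block's size
;   area in the middle of block   -> split_block (one entry becomes two)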
Subtractv1v2fromRAMtable
        ADD     v2, v1, v2              ; v2 = end address
        MOV     v1, v1, LSR #12         ; round base down
        ADD     v2, v2, #4096
        SUB     v2, v2, #1
        MOV     v2, v2, LSR #12         ; round end up

        LDR     v5, [a4]
        SUB     v8, a4, v5, LSL #3
10      TEQ     v8, a4
        MOVEQ   pc, lr
        LDMIA   v8!, {v3, v4}
        ADD     v6, v3, v4, LSR #12
        ADD     v6, v6, #1              ; v6 = end of RAM block
        CMP     v2, v3                  ; if our end <= RAM block start
        CMPHI   v6, v1                  ; or RAM block end <= our start
        BLS     %BT10                   ; then no intersection

        MOV     v4, v4, LSL #20         ; extract flags

        CMP     v1, v3
        BHI     not_bottom

        ; our area is at the bottom
        CMP     v2, v6
        BHS     remove_block

        SUB     v6, v6, v2              ; v6 = new size
        SUB     v6, v6, #1
        MOV     v6, v6, LSL #12
        ORR     v6, v6, v4, LSR #20     ; + flags
        STMDB   v8, {v2, v6}            ; store new base (= our end) and size
        B       %BT10

        ; we've completely covered a block. Remove it.
remove_block
        MOV     v7, v8
20      TEQ     v7, a4                  ; shuffle down subsequent blocks in table
        LDMNEIA v7, {v3, v4}
        STMNEDB v7, {v3, v4}
        ADDNE   v7, v7, #8
        BNE     %BT20
        SUB     v5, v5, #1
        SUB     a4, a4, #8
        STR     v5, [a4]
        SUB     v8, v8, #8
        B       %BT10

        ; our area is not at the bottom.
not_bottom
        CMP     v2, v6
        BLO     split_block

        ; our area is at the top
        SUB     v6, v1, v3              ; v6 = new size
        SUB     v6, v6, #1
        MOV     v6, v6, LSL #12
        ORR     v6, v6, v4, LSR #20     ; + flags
        STMDB   v8, {v3, v6}            ; store original base and new size
        B       %BT10

split_block
        MOV     v7, a4
30      TEQ     v7, v8                  ; shuffle up subsequent blocks in table
        LDMDB   v7, {v3, v4}
        STMNEIA v7, {v3, v4}
        SUBNE   v7, v7, #8
        BNE     %BT30
        ADD     v5, v5, #1
        ADD     a4, a4, #8
        STR     v5, [a4]

        MOV     v4, v4, LSL #20         ; (re)extract flags

        SUB     v7, v1, v3              ; v7 = size of first half
        SUB     v6, v6, v2              ; v6 = size of second half
        SUB     v7, v7, #1
        SUB     v6, v6, #1
        MOV     v7, v7, LSL #12
        MOV     v6, v6, LSL #12
        ORR     v7, v7, v4, LSR #20
        ORR     v6, v6, v4, LSR #20     ; + flags
        STMDB   v8, {v3, v7}
        STMIA   v8!, {v2, v6}
        B       %BT10


;void RISCOS_Start(unsigned int flags, int *riscos_header, int *hal_header, void *ref)
;

; We don't return, so no need to obey ATPCS, except for parameter passing.
; register usage:   v4 = location of VRAM
;                   v6 = amount of VRAM

        ROUT
RISCOS_Start
        TEQ     a4, #0
01      BEQ     %BT01                           ; Stop here if no RAM

        ; subtract the HAL and OS from the list of RAM areas
        MOV     v1, a2
        LDR     v2, [a2, #OSHdr_ImageSize]
        BL      Subtractv1v2fromRAMtable
        LDR     v1, [a3, #HALDesc_Start]
        ADD     v1, a3, v1
        LDR     v2, [a3, #HALDesc_Size]
        BL      Subtractv1v2fromRAMtable

        LDR     v5, [a4]                        ; v5 = the number of RAM blocks
        SUB     v8, a4, v5, LSL #3              ; Jump back to the start of the list.

        ; Search for some VRAM
05      LDMIA   v8!, {v1, v2}                   ; Get a block from the list. (v1,v2)=(addr,size+flags)
        TST     v2, #OSAddRAM_IsVRAM            ; Is it VRAM?
        BNE     %FT20                           ; If so, deal with it below
        TEQ     v8, a4                          ; Carry on until end of list or we find some.
        BNE     %BT05

        ; Extract some pseudo-VRAM from first DMA-capable RAM block
        SUB     v8, a4, v5, LSL #3              ; Rewind again.
06      LDMIA   v8!, {v1, v2}
        TEQ     v8, a4                          ; End of list?
        TSTNE   v2, #OSAddRAM_NoDMA             ; DMA capable?
        BNE     %BT06
        MOV     v2, v2, LSR #12                 ; Remove flags
        ADD     v2, v2, #1
        ; Is this the only DMA-capable block?
        MOV     v4, v8
        MOV     v6, #OSAddRAM_NoDMA
07      TEQ     v4, a4
        BEQ     %FT08
        LDR     v6, [v4, #4]
        ADD     v4, v4, #8
        TST     v6, #OSAddRAM_NoDMA
        BNE     %BT07
08
        ; v6 has NoDMA set if v8 was the only block
        TST     v6, #OSAddRAM_NoDMA
        MOV     v4, v1                          ; Allocate block as video memory
        MOV     v6, v2
        BEQ     %FT09
        SUBS    v6, v6, #(16*1024*1024):SHR:12  ; Leave 16M if it was the only DMA-capable block
        MOVLS   v6, v2, LSR #1                  ; If that left nothing (or went negative), take half the bank
09
        CMP     v6, #(32*1024*1024):SHR:12
        MOVHS   v6, #(32*1024*1024):SHR:12      ; Limit allocation to 32M (arbitrary)

        ADD     v1, v1, v6                      ; Adjust the RAM block base...
        SUBS    v2, v2, v6                      ; ... and the size
        LDREQ   v6, [v8, #-4]
        BEQ     %FT21                           ; pack array tighter if this block is all gone
        STR     v1, [v8, #-8]                   ; update base
        LDR     v1, [v8, #-4]
        MOV     v1, v1, LSL #20                 ; original flags
        ORR     v6, v6, v1                      ; merged into VRAM size
        ORR     v1, v1, v2                      ; and into new size
        MOV     v1, v1, ROR #20
        MOV     v6, v6, ROR #20
        SUB     v1, v1, #4096
        STR     v1, [v8, #-4]                   ; update size
        B       %FT30

        ; Note real VRAM parameters
20      MOV     v6, v2                          ; Remember the size+flags and
        MOV     v4, v1                          ; address of the VRAM
21      ADD     v6, v6, #4096                   ; Assume <4GB of VRAM!
22      TEQ     v8, a4                          ; if not at the end of the array
        LDMNEIA v8, {v1, v2}                    ; pack the array tighter
        STMNEDB v8, {v1, v2}
        ADDNE   v8, v8, #8
        BNE     %BT22
25      SUB     v5, v5, #1                      ; decrease the counter
        STR     v5, [a4, #-8]!                  ; and move the end marker down

30      SUB     v8, a4, v5, LSL #3              ; Rewind to start of list

        ; Scan forwards to find the fastest block of non-DMAable memory which is at least DRAMOffset_LastFixed size, at least 16KB aligned, and located in the first 4GB
        LDMIA   v8!, {v1, v2}
31
        TEQ     v8, a4
        MOVEQ   v1, v1, LSL #12
        BEQ     %FT32
        LDMIA   v8!, {v7, ip}
        MOV     sp, v7, ROR #2                  ; Bottom two bits (for 16KB alignment) rotated high
        CMP     sp, #1:SHL:18                   ; Ignore if not aligned, or base address >= 4GB
        BHS     %BT31
        CMP     ip, #DRAMOffset_LastFixed-4096
        ANDHS   sp, ip, #&F*OSAddRAM_Speed+OSAddRAM_NoDMA
        ANDHS   lr, v2, #&F*OSAddRAM_Speed+OSAddRAM_NoDMA
        ASSERT  OSAddRAM_Speed = 1:SHL:8
        ASSERT  OSAddRAM_NoDMA < OSAddRAM_Speed
        MOVHS   sp, sp, ROR #8                  ; Give NoDMA flag priority over speed when sorting
        CMPHS   sp, lr, ROR #8
        MOVHI   v1, v7
        MOVHI   v2, ip
        B       %BT31
32

        ; Fill in the Kernel's permanent memory table, sorting by address, speed and DMA ability.
        ; * Address: All memory that falls in the low 4GB of the physical map
        ;   comes first. This makes it easier for our initial memory allocation
        ;   (no danger of allocating pages which can't be accessed with the MMU
        ;   off), but may also help with wider software compatibility (all low-
        ;   RAM pages occupy the lowest physical page numbers)
        ; * Non-DMAable RAM is preferred over DMAable, as the kernel requires
        ;   very little DMAable RAM, and we don't want to permanently claim
        ;   DMAable RAM if we're not actually using it for DMA (in case machine
        ;   only has a tiny amount available)
        ; * Speed: Fastest RAM is listed first, so that we'll prefer to allocate
        ;   it for these important kernel/system areas
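        ; As a sketch, the insertion sort below compares keys computed like
        ; this (illustrative C; the function name is made up):
        ;
        ;   uint32_t sort_key(uint32_t addr_pages, uint32_t size_flags)
        ;   {
        ;       uint32_t key = size_flags & (0xF * OSAddRAM_Speed);
        ;       if (addr_pages < (1u << 20)) key |= 1u << 31;      /* low 4GB first */
        ;       if (size_flags & OSAddRAM_NoDMA) key |= 1u << 30;  /* then non-DMA */
        ;       return key;  /* higher key sorts earlier in the table */
        ;   }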
        ADD     ip, v1, #DRAMOffset_PageZero
        ASSERT  DRAMOffset_PageZero > 0         ; If the workspace block is the block containing the OS_AddRAM list, make sure the two don't overlap otherwise we might corrupt it while we copy it

        ADD     sp, v1, #DRAMOffset_ScratchSpace + ScratchSpaceSize

        Push    "a1,a2,a3"                      ; Remember our arguments

        SUB     v8, a4, v5, LSL #3              ; Rewind to start of list
        CMP     v5, #DRAMPhysTableSize          ; Don't overflow our table
        ADDHI   a4, v8, #DRAMPhysTableSize*8 - 8

        ; First put the VRAM information in to free up some regs
        ADD     v7, ip, #VideoPhysAddr
        STMIA   v7!, {v4, v6}

        ; Now fill in the rest
        ASSERT  DRAMPhysAddrA = VideoPhysAddr+8
        MOV     v1, v1, LSR #12
        ADDS    v2, v2, #4096                   ; Store true length
        ADDCS   v2, v2, #1:SHL:31               ; If it overflowed, must have been 4GB block, so clamp at 2GB (loop below will add the second 2GB)
        STMIA   v7!, {v1, v2}                   ; workspace block must be first
33
        TEQ     v8, a4
        BEQ     %FT39
        LDMIA   v8!, {v1, v2}
        ADDS    v2, v2, #4096                   ; Get true length
        ADDCS   v2, v2, #1:SHL:31               ; If it overflowed, must have been 4GB block, so split into two 2GB blocks
        SUBCS   v2, v2, #4096
        ADDCS   v1, v1, #1:SHL:(31-12)
        STMCSDB v8!, {v1, v2}
        ADDCS   v2, v2, #4096
        SUBCS   v1, v1, #1:SHL:(31-12)
        ADD     a1, ip, #DRAMPhysAddrA
        LDMIA   a1!, {a2, a3}
        TEQ     v1, a2
        BEQ     %BT33                           ; don't duplicate the initial block
        ; Perform insertion sort
        ; a1-a3, v3-v6, ip, lr free
        AND     v3, v2, #&F*OSAddRAM_Speed
        CMP     v1, #1:SHL:20
        ORRLO   v3, v3, #1:SHL:31               ; Low RAM takes priority
        TST     v2, #OSAddRAM_NoDMA
        ORRNE   v3, v3, #1:SHL:30               ; Followed by non-DMA
34
        AND     v4, a3, #&F*OSAddRAM_Speed
        CMP     a2, #1:SHL:20
        ORRLO   v4, v4, #1:SHL:31               ; Low RAM takes priority
        TST     a3, #OSAddRAM_NoDMA
        ORRNE   v4, v4, #1:SHL:30               ; Followed by non-DMA
        CMP     v3, v4                          ; Compare priority value
        BHI     %FT35
        TEQ     a1, v7
        LDMNEIA a1!, {a2, a3}
        BNE     %BT34
        ADD     a1, a1, #8
35
        ADD     v7, v7, #8
        ; Insert at a1-8, overwriting {a2, a3}
36
        STMDB   a1, {v1, v2}                   ; store new entry
        TEQ     a1, v7
        MOVNE   v1, a2                         ; if not at end, shuffle
        MOVNE   v2, a3                         ; overwritten entry down one,
        LDMNEIA a1!, {a2, a3}                  ; load next to be overwritten,
        BNE     %BT36                          ; and loop
        B       %BT33

39
        ; Now we have to work out the total RAM size
        MOV     a2, #0
        ADD     v6, ip, #PhysRamTable
        MOV     a3, v6
40
        LDMIA   v6!, {v1, v2}                   ; get address, size
        ADD     a2, a2, v2, LSR #12             ; add on size
        TEQ     v6, v7
        BNE     %BT40

        ; Work out how much DMAable RAM the HAL/kernel needs
        LDR     a1, [sp, #8]
        LDR     a1, [a1, #HALDesc_Flags]
        TST     a1, #HALFlag_NCNBWorkspace              ; do they want uncacheable workspace?
        LDRNE   a1, =SoundDMABuffers-CursorChunkAddress + ?SoundDMABuffers + 32*1024 + DRAMOffset_LastFixed
        LDREQ   a1, =SoundDMABuffers-CursorChunkAddress + ?SoundDMABuffers + DRAMOffset_LastFixed
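        ; (i.e. the cacheable cursor/system chunk, the sound DMA buffers,
        ; 32K of NCNB HAL workspace if requested, plus the fixed pages
        ; claimed at the DRAMOffset_* allocations)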
        ; Scan PhysRamTable for a DMAable block of at least this size, extract it, and stash it in InitDMABlock
        ; Once the initial memory claiming is done we can re-insert it
        ADD     a4, a3, #DRAMPhysAddrA-VideoPhysAddr    ; don't claim VRAM

        ; First block needs special treatment as we've already claimed some of it
        LDMIA   a4!, {v1, v2}
        TST     v2, #OSAddRAM_NoDMA
        BNE     %FT41
        CMP     v2, a1
        BLO     %FT41
        ; Oh crumbs, the first block is a match for our DMA block
        ; Claim it as normal, but set InitDMAEnd to v1+DRAMOffset_LastFixed so
        ; that the already-used portion won't get used for DMA
        ; We also need to be careful later on when picking the initial v2 value
        MOV     lr, v1, LSL #12
        ADD     lr, lr, #DRAMOffset_LastFixed
        STR     lr, [ip, #InitDMAEnd]
        B       %FT43
41
        ; Go on to check the rest of PhysRamTable
        SUB     a1, a1, #DRAMOffset_LastFixed
42
        LDMIA   a4!, {v1, v2}
        TST     v2, #OSAddRAM_NoDMA
        BNE     %BT42
        CMP     v2, a1
        BLO     %BT42
        CMP     v1, #1:SHL:20 ; <4GB only for now
        BHS     %BT42
        ; Make a note of this block
        MOV     lr, v1, LSL #12
        STR     lr, [ip, #InitDMAEnd]
43
        STR     v1, [ip, #InitDMABlock]
        STR     v2, [ip, #InitDMABlock+4]
        SUB     lr, a4, a3
        STR     lr, [ip, #InitDMAOffset]
        ; Now shrink/remove this memory from PhysRamTable
        SUB     v2, v2, a1
        ADD     v1, v1, a1, LSR #12
        CMP     v2, #4096               ; Block all gone?
        STMHSDB a4, {v1, v2}            ; no, just shrink it
        BHS     %FT55
45
        CMP     a4, v7
        LDMNEIA a4, {v1, v2}
        STMNEDB a4, {v1, v2}
        ADDNE   a4, a4, #8
        BNE     %BT45
        SUB     v7, v7, #8

; a2 = Total memory size (pages)
; a3 = PhysRamTable
; v7 = After last used entry in PhysRamTable
; ip -> ZeroPage

; now store zeros to fill out table

55
        ADD     v2, a3, #PhysRamTableEnd-PhysRamTable
        MOV     v3, #0
        MOV     v4, #0
57
        CMP     v7, v2
        STMLOIA v7!, {v3, v4}
        BLO     %BT57

; Calculate PhysIllegalMask before anything tries to use it
        BL      DeterminePhysIllegalMask

; Time to set up the L1PT. Just zero it out for now.

 [ LongDesc
        ASSERT  DRAMOffset_LL1PT = DRAMOffset_LL2PT+16*1024
        LDR     a4, =DRAMOffset_LL1PT+4096-(PhysRamTable+DRAMOffset_PageZero) ; offset from a3 to L1PT end
        ADD     a3, a3, a4
        MOV     a4, #16*1024+4096 ; Clear the full L1PT page, even though we only need/use a small part of it
 |
        LDR     a4, =DRAMOffset_L1PT+16*1024-(PhysRamTable+DRAMOffset_PageZero) ; offset from a3 to L1PT end
        ADD     a3, a3, a4
        MOV     a4, #16*1024
 ]
        MOV     v2, #0
        MOV     v3, #0
        MOV     v4, #0
        MOV     v5, #0
        MOV     v6, #0
        MOV     v7, #0
        MOV     v8, #0
        MOV     ip, #0
60
        STMDB   a3!, {v2-v8,ip}                         ; start at end and work back
        SUBS    a4, a4, #8*4
        BNE     %BT60

 [ LongDesc
        ; a3 -> L2PT
        ; Set up L1PT
        ADD     lr, a3, #DRAMOffset_LL1PT - DRAMOffset_PageTables
        ASSERT  :INDEX: DRAMOffset_PageTables = :INDEX: DRAMOffset_LL2PT
        ORR     v1, a3, #LL12_Table
        MOV     v2, #0
        STRD    v1, [lr], #8
        ADD     v1, v1, #4096
        STRD    v1, [lr], #8
        ADD     v1, v1, #4096
        STRD    v1, [lr], #8
        ADD     v1, v1, #4096
        STRD    v1, [lr], #8
 ]

        ADD     v1, a3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        ADD     v2, a3, #DRAMOffset_LastFixed - DRAMOffset_PageTables
        STR     a2, [v1, #RAMLIMIT]                     ; remember the RAM size
        SUB     lr, a2, #1
        STR     lr, [v1, #MaxCamEntry]
        MOV     lr, a2, LSR #12-CAM_EntrySizeLog2       ; no. of pages needed for CAM
        CMP     a2, lr, LSL #12-CAM_EntrySizeLog2
        ADDNE   lr, lr, #1                              ; round up
        MOV     lr, lr, LSL #12
        STR     lr, [v1, #SoftCamMapSize]
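        ; (Each 4K page of CAM holds 2^(12-CAM_EntrySizeLog2) entries, one
        ; per RAM page, so the CAM needs ceil(pages / 2^(12-CAM_EntrySizeLog2))
        ; pages of its own - which is what the round-up above computes.)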
        STR     a3, [v1, #InitUsedStart]                ; store start of L1PT

        ADD     v1, v1, #DRAMPhysAddrA
        MOV     v2, v2, LSR #12
        MOV     v3, a3

        ; Detect if the DMA claiming adjusted the first block
        ; If so, we'll need to reset v2 to the start of the block at v1
        LDR     a1, [v1]
        ADD     lr, a1, #DRAMOffset_LastFixed:SHR:12
        TEQ     lr, v2
        MOVNE   v2, a1

; For the next batch of allocation routines, v1-v3 are treated as globals.
; v1 -> current entry in PhysRamTable
; v2 -> next address to allocate in v1 (may point at end of v1), in units of pages
; v3 -> L1PT (or 0 if MMU on - not yet)

; Set up some temporary PCBTrans and PPLTrans pointers, and the initial page flags used by the page tables
        ADD     a1, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        BL      Init_PCBTrans

; Allocate the L2PT backing store for the logical L2PT space, to
; prevent recursion.
  [ LongDesc
        ; L3PT is 8MB in size, and each 4KB of L3PT only covers 2MB of RAM
        ; This means we'll need 4 pages of L3PT to create the full logical
        ; mapping of L3PT. So to avoid recursion, we need to start by explicitly
        ; allocating the area that will be used for the logical mapping of L3PT.
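        ; (Each 4K page mapped costs an 8-byte L3PT entry, so the entries
        ; covering a given address live at LL3PT + (address:SHR:9). Hence
        ; LL3PT+(LL3PT:SHR:9) is where L3PT's own mapping lives, and
        ; &00800000:SHR:9 = 16K spans all four of those pages.)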
        LDR     a1, =LL3PT+(LL3PT:SHR:9)
        MOV     a2, #&00800000:SHR:9
        BL      AllocateL2PT
        ; Now allocate the rest of the L3PT logical mapping
        LDR     a1, =LL3PT
        MOV     a2, #&00800000
        BL      AllocateL2PT
  |
        ; Each 4KB of L2PT covers 4MB of RAM, and L2PT is 4MB in size, so the
        ; first page we allocate here will allow us to create the full logical
        ; mapping of L2PT (protecting against any recursion in the allocation
        ; routines)
        LDR     a1, =L2PT
        MOV     a2, #&00400000
        BL      AllocateL2PT
  ]

; Allocate workspace for the HAL

        ADD     a4, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [sp, #8]                            ; recover pushed HAL header
        LDR     a1, =HALWorkspace
        LDR     a2, =AreaFlags_HALWorkspace
        LDR     lr, [a3, #HALDesc_Workspace]            ; their workspace
        LDR     ip, [a3, #HALDesc_NumEntries]           ; plus 1 word per entry
        CMP     ip, #KnownHALEntries
        MOVLO   ip, #KnownHALEntries
        ADD     lr, lr, ip, LSL #2
        MOV     a3, lr, LSR #12                         ; round workspace up to whole
        MOV     a3, a3, LSL #12                         ; number of pages
        CMP     a3, lr
        ADDNE   a3, a3, #&1000
        STR     a3, [a4, #HAL_WsSize]                   ; Make a note of allocated space
        ADD     ip, a1, ip, LSL #2                      ; Their workspace starts
        STR     ip, [a4, #HAL_Workspace]                ; after our table of entries
        BL      Init_MapInRAM

        LDR     a3, [sp, #8]                            ; recover pushed HAL header
        LDR     lr, [a3, #HALDesc_Flags]
        TST     lr, #HALFlag_NCNBWorkspace              ; do they want uncacheable
        LDRNE   a1, =HALWorkspaceNCNB                   ; workspace?
        LDRNE   a2, =AreaFlags_HALWorkspaceNCNB
        LDRNE   a3, =32*1024
        BLNE    Init_MapInRAM_DMA

; Bootstrap time. We want to get the MMU on ASAP. We also don't want to have to
; clear up too much mess later. So what we'll do is map in the three fixed areas
; (L1PT, scratch space and page zero), the CAM, ourselves, and the HAL,
; then turn on the MMU. The CAM will be filled in once the MMU is on, by
; reverse-engineering the page tables?
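
; (The Init_MapIn calls below appear to take a1 = physical address,
; a2 = logical address, a3 = area/page flags, a4 = size in bytes.)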

        ; Map in page zero
        ADD     a1, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a2, =ZeroPage
        LDR     a3, =AreaFlags_ZeroPage
        MOV     a4, #16*1024
        BL      Init_MapIn

        ; Map in scratch space
        ADD     a1, v3, #DRAMOffset_ScratchSpace - DRAMOffset_PageTables
        MOV     a2, #ScratchSpace
        LDR     a3, =AreaFlags_ScratchSpace
        MOV     a4, #16*1024
        BL      Init_MapIn

      [ LongDesc
        ; Map in L1PT+L2PT
        MOV     a1, v3
        LDR     a2, =LL2PT
        ADD     a3, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [a3, #PageTable_PageFlags]
        ORR     a3, a3, #PageFlags_Unavailable
        ASSERT  LL1PT=LL2PT+16*1024
        MOV     a4, #16*1024+4096
        BL      Init_MapIn
      |
        ; Map in L1PT
        MOV     a1, v3
        LDR     a2, =L1PT
        ADD     a3, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [a3, #PageTable_PageFlags]
        ORR     a3, a3, #PageFlags_Unavailable
        MOV     a4, #16*1024
        BL      Init_MapIn
      ]

        ; Map in L1PT again in PhysicalAccess (see below)
      [ LongDesc
        MOV     a1, v3, LSR #21
        MOV     a1, a1, LSL #21                 ; 2MB containing L1PT
        MOV     a4, #2*1024*1024
      |
        MOV     a1, v3, LSR #20
        MOV     a1, a1, LSL #20                 ; megabyte containing L1PT
        MOV     a4, #1024*1024
      ]
        LDR     a2, =PhysicalAccess
        ADD     a3, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        LDR     a3, [a3, #PageTable_PageFlags]
        ORR     a3, a3, #PageFlags_Unavailable
        BL      Init_MapIn


        ; Examine HAL and RISC OS locations
        LDMFD   sp, {v4,v5,v6}                  ; v4 = flags, v5 = RO desc, v6 = HAL desc
        LDR     lr, [v6, #HALDesc_Size]
        LDR     v7, [v6, #HALDesc_Start]
        ADD     v6, v6, v7                      ; (v6,v8)=(start,end) of HAL
        ADD     v8, v6, lr

        LDR     v7, [v5, #OSHdr_ImageSize]
        ADD     v7, v5, v7                      ; (v5,v7)=(start,end) of RISC OS

        TEQ     v8, v5                          ; check contiguity (as in a ROM image)
        BNE     %FT70

        ; HAL and RISC OS are contiguous. Yum.
        MOV     a1, v6
        LDR     a2, =RISCOS_Header
        SUB     a2, a2, lr

        SUB     ip, a2, a1                      ; change physical addresses passed in
        LDMIB   sp, {a3, a4}                    ; into logical addresses
        ADD     a3, a3, ip
        ADD     a4, a4, ip
        STMIB   sp, {a3, a4}
        LDR     a3, [v5, #OSHdr_DecompressHdr]  ; check if ROM is compressed, and if so, make writeable
        CMP     a3, #0
        MOVNE   a3, #OSAP_None
        MOVEQ   a3, #OSAP_ROM
        SUB     a4, v7, v6
        BL      Init_MapIn
        MOV     a3, v6
        B       %FT75

70
        ; HAL is separate. (We should cope with larger images)
        LDR     a2, =ROM
        MOV     a1, v6
        SUB     ip, a2, a1                      ; change physical address passed in
        LDR     a3, [sp, #8]                    ; into logical address
        ADD     a3, a3, ip
        STR     a3, [sp, #8]
        SUB     a4, v8, v6
        MOV     a3, #OSAP_ROM
        BL      Init_MapIn

        ; And now map in RISC OS
        LDR     a2, =RISCOS_Header              ; Hmm - what if position independent?
        MOV     a1, v5
        SUB     ip, a2, a1                      ; change physical address passed in
        LDR     a3, [sp, #4]                    ; into logical address
        ADD     a3, a3, ip
        STR     a3, [sp, #4]
        SUB     a4, v7, v5
        LDR     a3, [v5, #OSHdr_DecompressHdr]
        CMP     a3, #0
        MOVNE   a3, #OSAP_None
        MOVEQ   a3, #OSAP_ROM
        BL      Init_MapIn
        MOV     a3, v5
75
        ; We've now allocated all the pages we're going to before the MMU comes on.
        ; Note the end address (for RAM clear)
        ADD     a1, v3, #DRAMOffset_PageZero - DRAMOffset_PageTables
        STR     v1, [a1, #InitUsedBlock]
        STR     v2, [a1, #InitUsedEnd]
        STR     a3, [a1, #ROMPhysAddr]

        ; Note the HAL flags passed in.
        LDR     a2, [sp, #0]
        STR     a2, [a1, #HAL_StartFlags]

        ; Set up a reset IRQ handler (for IIC CMOS access)
        MSR     CPSR_c, #IRQ32_mode + I32_bit + F32_bit
        LDR     sp_irq, =ScratchSpace + 1024    ; 1K is plenty since Reset_IRQ_Handler now runs in SVC mode
        MSR     CPSR_c, #SVC32_mode + I32_bit + F32_bit
        LDR     a2, =Reset_IRQ_Handler
        STR     a2, [a1, #InitIRQHandler]

        ; Fill in some initial processor vectors. These will be used during ARM
        ; analysis, once the MMU is on. We do it here before the data cache is
        ; activated to save any IMB issues.
        ADRL    a2, InitProcVecs
        ADD     a3, a2, #InitProcVecsEnd - InitProcVecs
76      LDR     a4, [a2], #4
        CMP     a2, a3
        STR     a4, [a1], #4
        BLO     %BT76

MMU_activation_zone
; The time has come to activate the MMU. Steady now... Due to unpredictability of MMU
; activation, need to ensure that mapped and unmapped addresses are equivalent. To
; do this, we temporarily make the section containing virtual address MMUon_instr map
; to the same physical address. In case the code crosses a section boundary, do the
; next section as well.
;
        MOV     a1, #4_0000000000000001                         ; domain 0 client only
        ARM_MMU_domain a1

        ADR     a1, MMU_activation_zone
      [ LongDesc
        ; To avoid refactoring things too much, only touch one page table
        ; entry instead of two. However because each L2PT entry is 2MB we end
        ; up covering the same area (just 2MB aligned instead of the 1MB
        ; alignment that you'd get with short descriptor L1PT).
        MOV     a3, a1, LSR #21                 ; a3 = 2MB number (stays there till end)
        ADD     lr, v3, a3, LSL #3              ; lr -> L2PT entry
        LDRD    a1, [lr]                        ; remember old mapping
        Push    "a1,a2"
        MOV     a1, a3, LSL #21
        ORR     a1, a1, #LL12_Block + LLAttr_Nrm_NC + LL_Page_LowAttr_AP2 ; Non-cacheable, read-only
        ORR     a1, a1, #LL_Page_LowAttr_AF
        MOV     a2, #0
        STRD    a1, [lr]
        Pull    "a1,a2"
      |
        MOV     a1, a1, LSR #20                 ; a1 = megabyte number (stays there till end)
        ADD     lr, v3, a1, LSL #2              ; lr -> L1PT entry
        LDMIA   lr, {a2, a3}                    ; remember old mappings
 [ MEMM_Type = "VMSAv6"
        LDR     ip, =(AP_ROM * L1_APMult) + L1_Section
 |
  [ ARM6support
        LDR     ip, =(AP_None * L1_APMult) + L1_U + L1_Section
  |
        LDR     ip, =(AP_ROM * L1_APMult) + L1_U + L1_Section
  ]
 ]
        ORR     a4, ip, a1, LSL #20             ; not cacheable, as we don't want
        ADD     v4, a4, #1024*1024              ; to fill the cache with rubbish
        STMIA   lr, {a4, v4}
      ]

        MOV     a4, a1
        Push    "a2,lr"
      [ LongDesc
        ADD     a1, v3, #DRAMOffset_LL1PT-DRAMOffset_PageTables
      |
        MOV     a1, v3
      ]
        ADD     a2, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        BL      SetTTBR
        Pull    "a2,lr"
        BL      Init_ARMarch                    ; corrupts a1 and ip
        MOV     ip, a1                          ; Remember architecture for later
        MOV     a1, a4

        MSREQ   CPSR_c, #F32_bit+I32_bit+UND32_mode ; Recover the soft copy of the CR
        MOVEQ   v5, sp
        ARM_read_control v5, NE
  [ CacheOff
        ORR     v5, v5, #MMUC_M                 ; MMU on
        ORR     v5, v5, #MMUC_R                 ; ROM mode enable
  |
        ORR     v5, v5, #MMUC_W+MMUC_C+MMUC_M   ; Write buffer, data cache, MMU on
        ORR     v5, v5, #MMUC_R+MMUC_Z          ; ROM mode enable, branch predict enable
  ]
  [ MEMM_Type = "VMSAv6"
        ORR     v5, v5, #MMUC_XP ; Extended pages enabled (v6)
        BIC     v5, v5, #MMUC_TRE+MMUC_AFE ; TEX remap, Access Flag disabled
        BIC     v5, v5, #MMUC_EE+MMUC_TE+MMUC_VE ; Exceptions = nonvectored LE ARM
      [ SupportARMv6 :LAND: NoARMv7
        ; Deal with a couple of ARM11 errata
        ARM_read_ID lr
        LDR     a4, =&FFF0
        AND     lr, lr, a4
        LDR     a4, =&B760
        TEQ     lr, a4
        BNE     %FT01
        ORR     v5, v5, #MMUC_FI ; Erratum 716151: Disable hit-under-miss (enable fast interrupt mode) to prevent D-cache corruption from D-cache cleaning (the other workaround, ensuring a DSB exists inbetween the clean op and the next store access to that cache line, feels a bit heavy-handed since we'd probably have to disable IRQs to make it fully safe)
        ; Update the aux control register
        MRC     p15, 0, lr, c1, c0, 1
        ; Bit 28: Erratum 714068: Set PHD bit to prevent deadlock from PLI or I-cache invalidate by MVA
        ; Bit 31: Erratum 716151: Set FIO bit to override some of the behaviour implied by FI bit
        ORR     lr, lr, #(1:SHL:28)+(1:SHL:31)
        MCR     p15, 0, lr, c1, c0, 1
        myISB   ,lr
        TEQ     pc, #0 ; Restore NE condition from Init_ARMarch
01
      ]
  ]
  [ NoUnaligned
        ORR     v5, v5, #MMUC_A ; Alignment exceptions on
  ]
  [ HiProcVecs
        ORR     v5, v5, #MMUC_V ; High processor vectors enabled
  ]

MMUon_instr
; Note, no RAM access until we've reached MMUon_nol1ptoverlap and the flat
; logical-physical mapping of the ROM has been removed (we can't guarantee that
; the RAM mapping hasn't been clobbered, and SP is currently bogus).
        ARM_write_control v5
  [ MEMM_Type = "VMSAv6"
        MOV     lr, #0
        myISB   ,lr,,y ; Just in case
  ]
        MOVEQ   sp, v5
        MSREQ   CPSR_c, #F32_bit+I32_bit+SVC32_mode

  [ MEMM_Type = "VMSAv6"
        CMP     ip, #ARMvF
        BEQ     %FT01
        MCRNE   ARM_config_cp,0,lr,ARMv4_cache_reg,C7           ; junk MMU-off contents of I-cache (works on ARMv3)
        B       %FT02
01      MCREQ   p15, 0, lr, c7, c5, 0           ; invalidate instruction cache
        MCREQ   p15, 0, lr, c8, c7, 0           ; invalidate TLBs
        MCREQ   p15, 0, lr, c7, c5, 6           ; invalidate branch predictor
        myISB   ,lr,,y ; Ensure below branch works
        BLEQ    HAL_InvalidateCache_ARMvF       ; invalidate data cache (and instruction+TLBs again!)
02
  |
        MOV     lr, #0                                          ; junk MMU-off contents of I-cache
        MCR     ARM_config_cp,0,lr,ARMv4_cache_reg,C7           ; (works on ARMv3)
  ]

; MMU now on. Need to jump to logical copy of ourselves. Complication arises if our
; physical address overlaps our logical address - in that case we need to map
; in another disjoint copy of ourselves and branch to that first, then restore the
; original two sections.
        ADRL    a4, RISCOS_Header
        LDR     ip, =RISCOS_Header
        SUB     ip, ip, a4
        ADR     a4, MMU_activation_zone
      [ LongDesc
        MOV     a4, a4, LSR #21
        MOV     a4, a4, LSL #21                 ; a4 = base of scrambled region
      |
        MOV     a4, a4, LSR #20
        MOV     a4, a4, LSL #20                 ; a4 = base of scrambled region
      ]
        ADD     v4, a4, #2*1024*1024            ; v4 = top of scrambled region
        SUB     v4, v4, #1                      ;      (inclusive, in case wrapped to 0)
        ADR     v5, MMUon_resume
        ADD     v5, v5, ip                      ; v5 = virtual address of MMUon_resume
        CMP     v5, a4
        BLO     MMUon_nooverlap
        CMP     v5, v4
        BHI     MMUon_nooverlap

      [ LongDesc
        ASSERT  ROM > 4*1024*1024
; Oh dear. We know the ROM lives high up, so we'll mangle 00200000-003FFFFF.
; But as we're overlapping the ROM, we know we're not overlapping the page tables.
        LDR     lr, =LL2PT                      ; accessing the L2PT virtually now
        ; Use IP+SP as temp
        ORR     ip, a4, #LL12_Block + LLAttr_Nrm_NC + LL_Page_LowAttr_AP2 ; Non-cacheable, read-only
        ORR     ip, ip, #LL_Page_LowAttr_AF
        MOV     sp, #0
        LDRD    v7, [lr, #8]
        STRD    ip, [lr, #8]
        myDSB   ,ip                             ; there shouldn't have been anything at the target, so DSB+ISB is all that's needed for synchronisation
        myISB   ,ip,,y
        RSB     ip, a4, #&00200000
        ADD     pc, pc, ip
        NOP
MMUon_overlapresume                             ; now executing from 00200000
        ADD     ip, lr, a4, LSR #18
        STRD    a1, [ip]                        ; restore original set of mappings
        BL      Init_PageTablesChanged

        MOV     a1, v7                          ; arrange for code below
        MOV     a2, v8                          ; to restore this area instead
        MOV     a3, #1
      |
        ASSERT  ROM > 3*1024*1024
; Oh dear. We know the ROM lives high up, so we'll mangle 00100000-002FFFFF.
; But as we're overlapping the ROM, we know we're not overlapping the page tables.
        LDR     lr, =L1PT                       ; accessing the L1PT virtually now
 [ MEMM_Type = "VMSAv6"
        LDR     ip, =(AP_ROM * L1_APMult) + L1_Section
 |
  [ ARM6support
        LDR     ip, =(AP_None * L1_APMult) + L1_U + L1_Section
  |
        LDR     ip, =(AP_ROM * L1_APMult) + L1_U + L1_Section
  ]
 ]
        ORR     v6, a4, ip
        ADD     ip, v6, #1024*1024
        LDMIB   lr, {v7, v8}                    ; sections 1 and 2
        STMIB   lr, {v6, ip}
 [ MEMM_Type = "VMSAv6"
        myDSB   ,ip                             ; there shouldn't have been anything at the target, so DSB+ISB is all that's needed for synchronisation
        myISB   ,ip,,y
 ]
        RSB     ip, a4, #&00100000
        ADD     pc, pc, ip
        NOP
MMUon_overlapresume                             ; now executing from 00100000
        ADD     ip, lr, a4, LSR #18
        STMIA   ip, {a2, a3}                    ; restore original set of mappings
        BL      Init_PageTablesChanged

        MOV     a2, v7                          ; arrange for code below
        MOV     a3, v8                          ; to restore section 1+2 instead
        MOV     a1, #1
      ]

; Jump to the final logical mapping of ROM
MMUon_nooverlap
        ADRL    lr, RISCOS_Header
        LDR     ip, =RISCOS_Header
        SUB     ip, ip, lr
        ADD     pc, pc, ip
        NOP
MMUon_resume
; Repair the page table mapping that we temporarily modified.
; But what if the logical address of the page tables is at the physical address of the code? (i.e. our temporary modification is blocking access to the page tables)
; Then we have to access it via PhysicalAccess instead.
      [ LongDesc
        LDR     lr, =LL2PT
      |
        LDR     lr, =L1PT
      ]
        CMP     lr, a4
        BLO     MMUon_nol1ptoverlap
        CMP     lr, v4
        BHI     MMUon_nol1ptoverlap
; PhysicalAccess points to the megabyte (or 2MB for long descriptors) containing the L1PT. Find the L1PT within it.
        LDR     lr, =PhysicalAccess
      [ LongDesc
        MOV     v6, v3, LSL #11
        ORR     lr, lr, v6, LSR #11
MMUon_nol1ptoverlap
        ADD     lr, lr, a3, LSL #3
        STRD    a1, [lr]
      |
        MOV     v6, v3, LSL #12
        ORR     lr, lr, v6, LSR #12
MMUon_nol1ptoverlap
        ADD     lr, lr, a1, LSL #2
        STMIA   lr, {a2, a3}
      ]
        BL      Init_PageTablesChanged

; The MMU is now on. Wahey. Let's get allocating.
;
; Input:
; v3 = phys addr of DRAMOffset_PageTables
; R13_und = MMU control register soft copy (for ARMv3)
;
; All other registers are set up from scratch

        LDR     sp, =ScratchSpace + ScratchSpaceSize - 4*3 ; 3 items already on stack :)

        LDR     a1, =ZeroPage

        ADD     lr, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables ; lr = PhysAddr of zero page

        LDR     v1, [a1, #InitUsedBlock]                ; turn this from Phys to Log
        SUB     v1, v1, lr
        ADD     v1, v1, a1
        STR     v1, [a1, #InitUsedBlock]
        LDR     v2, [a1, #InitUsedEnd]

; Store the logical address of the HAL descriptor
        LDR     a2, [sp, #8]
        STR     a2, [a1, #HAL_Descriptor]

        MOV     v3, #0                                  ; "MMU is on" signal

        BL      ARM_Analyse

        ChangedProcVecs a1

      [ LongDesc
        ASSERT  L1_Fault = LL_Fault
      ]
        MOV     a1, #L1_Fault
        BL      ReleasePhysicalAddress

        LDR     a1, =HALWorkspace
        LDR     a2, =ZeroPage
        LDR     a3, [a2, #HAL_WsSize]
      [ ZeroPage <> 0
        MOV     a2, #0
      ]
        BL      memset

        LDR     a2, =ZeroPage
        LDR     a1, =IOLimit
        STR     a1, [a2, #IOAllocLimit]
        LDR     a1, [a2, #SoftCamMapSize]
        RSB     a1, a1, #CAMTop ; Start of CAM
      [ LongDesc
        BFC     a1, #0, #21 ; Round down to 2MB for IO start
      |
        MOV     a1, a1, LSR #20 ; Round down to 1MB for IO start
        MOV     a1, a1, LSL #20
      ]
        STR     a1, [a2, #IOAllocPtr]
        STR     a1, [a2, #IOAllocTop]

        BL      SetUpHALEntryTable

; Initialise the HAL. Due to its memory claiming we need to get our v1 and v2 values
; into workspace and out again around it.

        LDR     a1, =ZeroPage
        STR     v1, [a1, #InitUsedBlock]
        STR     v2, [a1, #InitUsedEnd]

        LDR     a1, =RISCOS_Header
        LDR     a2, =HALWorkspaceNCNB
        AddressHAL
        CallHAL HAL_Init

        DebugTX "HAL initialised"

        MOV     a1, #64 ; Old limit prior to OMAP3 port
        CallHAL HAL_IRQMax
        CMP     a1, #MaxInterrupts
        MOVHI   a1, #MaxInterrupts ; Avoid catastrophic failure if someone forgot to increase MaxInterrupts
        LDR     a2, =ZeroPage
        STR     a1, [a2, #IRQMax]

        LDR     v1, [a2, #InitUsedBlock]
        LDR     v2, [a2, #InitUsedEnd]

; Start timer zero, at 100 ticks per second
        MOV     a1, #0
        CallHAL HAL_TimerGranularity

        MOV     a2, a1
        MOV     a1, #100
        BL      __rt_udiv

        MOV     a2, a1
        MOV     a1, #0
        CallHAL HAL_TimerSetPeriod

        DebugTX "IICInit"

        BL      IICInit

; Remember some stuff that's about to get zapped
        LDR     ip, =ZeroPage
        LDR     v4, [ip, #ROMPhysAddr]
        LDR     v5, [ip, #RAMLIMIT]
        LDR     v7, [ip, #MaxCamEntry]
        LDR     v8, [ip, #IRQMax]

        LDR     a1, [ip, #HAL_StartFlags]
        TST     a1, #OSStartFlag_RAMCleared
        BLEQ    ClearWkspRAM            ; Only clear the memory if the HAL didn't

; Put it back
        LDR     a1, =ZeroPage
        STR     v4, [a1, #ROMPhysAddr]
        STR     v5, [a1, #RAMLIMIT]
        STR     v7, [a1, #MaxCamEntry]
        STR     v8, [a1, #IRQMax]
        MSR     CPSR_c, #F32_bit + UND32_mode           ; retrieve the MMU control register soft copy
        STR     sp, [a1, #MMUControlSoftCopy]
        MSR     CPSR_c, #F32_bit + SVC32_mode
        MOV     v8, a1

; Calculate CPU feature flags
        BL      ReadCPUFeatures

        DebugTX "HAL_CleanerSpace"

; Set up the data cache cleaner space if necessary (eg. for StrongARM core)
        MOV     a1, #-1
        CallHAL HAL_CleanerSpace
        CMP     a1, #-1          ; -1 means none needed (the HAL only knows this for a specific ARM core, eg. system-on-chip)
        BEQ     %FT20
        LDR     a2, =DCacheCleanAddress
        LDR     a3, =AreaFlags_DCacheClean
        ASSERT  DCacheCleanSize = 4*&10000                ; 64k of physical space used 4 times (allows large page mapping)
        MOV     a4, #&10000
        MOV     ip, #4
        SUB     sp, sp, #5*4       ;room for a1-a4,ip
10
        STMIA   sp, {a1-a4, ip}
        BL      Init_MapIn
        LDMIA   sp, {a1-a4, ip}
        SUBS    ip, ip, #1
        ADD     a2, a2, #&10000
        BNE     %BT10
        ADD     sp, sp, #5*4

20
; Decompress the ROM
        LDR     a1, =RISCOS_Header
        LDR     a2, [a1, #OSHdr_DecompressHdr]
        CMP     a2, #0
        BEQ     %FT30
        ADD     ip, a1, a2
        ASSERT  OSDecompHdr_WSSize = 0
        ASSERT  OSDecompHdr_Code = 4
        LDMIA   ip, {a3-a4}
        ADRL    a2, SyncCodeAreas
        CMP     a3, #0 ; Any workspace required?
        ADD     a4, a4, ip
   [ DebugHALTX
        BNE     %FT25
        DebugTX "Decompressing ROM, no workspace required"
      [ NoARMv5
        MOV     lr, pc
        MOV     pc, a4
      |
        BLX     a4
      ]
        DebugTX "Decompression complete"
        B       %FT27
25
   |
        ADREQ   lr, %FT27
        MOVEQ   pc, a4
   ]
        Push    "a1-a4,v1-v2,v5-v7"
; Allocate workspace for decompression code
; Workspace is located at a 4MB-aligned log addr, and is a multiple of 1MB in
; size. This greatly simplifies the code required to free the workspace, since
; we can guarantee it will have been section-mapped, and won't hit any
; partially-allocated L2PT blocks (where 4 L1PT entries point to subsections of
; the same L2PT page)
; This means all we need to do to free the workspace is zap the L1PT entries
; and rollback v1 & v2
; Note: This is effectively a MB-aligned version of Init_MapInRAM
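; (With short descriptors, one 4K page of L2PT holds four 1K chunks of 256
; entries, each backing a 1MB section, so 4MB is the smallest granule whose
; backing L2PT page is never shared with a neighbouring allocation. For
; long descriptors the analogous granule is the 2MB backed by one 4K L3PT
; page, hence ROMDecompAlign below.)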
ROMDecompWSAddr * 4<<20
   [ LongDesc
ROMDecompAlign * 21 ; 2MB aligned for long descriptor format
   |
ROMDecompAlign * 20
   ]
        DebugTX "Allocating decompression workspace"
        LDR     v5, =(1<<ROMDecompAlign)-1
        ADD     v7, a3, v5
        BIC     v7, v7, v5 ; MB-aligned size
        STR     v7, [sp, #8] ; Overwrite stacked WS size
        MOV     v6, #ROMDecompWSAddr ; Current log addr
26
        ADD     v2, v2, v5
        BIC     v2, v2, v5 ; MB-aligned physram
        LDMIA   v1, {a2, a3}
        SUB     a2, v2, a2 ; Amount of bank used
        SUB     a2, a3, a2 ; Amount of bank remaining
        MOVS    a2, a2, ASR #ROMDecompAlign ; Round down to nearest MB
        LDRLE   v2, [v1, #8]! ; Move to next bank if 0MB left
        BLE     %BT26
        CMP     a2, v7, LSR #ROMDecompAlign
        MOVHS   a4, v7
        MOVLO   a4, a2, LSL #ROMDecompAlign ; a4 = amount to take
        MOV     a1, v2 ; set up parameters for MapIn call
        MOV     a2, v6
        MOV     a3, #OSAP_None
        SUB     v7, v7, a4 ; Decrease amount to allocate
        ADD     v2, v2, a4 ; Increase physram ptr
        ADD     v6, v6, a4 ; Increase logram ptr
        BL      Init_MapIn
        CMP     v7, #0
        BNE     %BT26
        Pull    "a1-a2,v1-v2" ; Pull OS header, IMB func ptr, workspace size, decompression code
        MOV     a3, #ROMDecompWSAddr
        DebugTX "Decompressing ROM"
      [ NoARMv5
        MOV     lr, pc
        MOV     pc, v2
      |
        BLX     v2
      ]
        DebugTX "Decompression complete"
; Before we free the workspace, make sure we zero it
        MOV     a1, #ROMDecompWSAddr
        MOV     a2, #0
        MOV     a3, v1
        BL      memset
; Flush the workspace from the cache so we can unmap it
; Really we should make the pages uncacheable first, but for simplicity we do a
; full cache+TLB clean+invalidate later on when changing the ROM permissions
        MOV     a1, #ROMDecompWSAddr
        ADD     a2, a1, v1
        ARMop   Cache_CleanInvalidateRange
; Zero each L1PT entry
      [ LongDesc
        ; Ensure STRD is used
        MOV     a1, #0
        MOV     a2, #0
        LDR     a3, =LL2PT+(ROMDecompWSAddr>>18)
        MOV     a4, v1, LSR #18
265
        STRD    a1, [a3], #8
        SUBS    a4, a4, #8
        BGE     %BT265
      |
        LDR     a1, =L1PT+(ROMDecompWSAddr>>18)
        MOV     a2, #0
        MOV     a3, v1, LSR #18
        BL      memset
      ]
; Pop our registers and we're done
        Pull    "v1-v2,v5-v7"
        DebugTX "ROM decompression workspace freed"
27
; Now that the ROM is decompressed we need to change the ROM page mapping to
; read-only. The easiest way to do this is to make another call to Init_MapIn.
; But before we can do that we need to work out if the HAL+OS are contiguous in
; physical space, so that we remap the correct area.
        LDR     a1, =RISCOS_Header
        BL      RISCOS_LogToPhys
        Push    "a1"
        LDR     a1, =ROM
        BL      RISCOS_LogToPhys
        Pull    "a2"                    ; a1 = HAL phys, a2 = OS phys
        LDR     a3, =ZeroPage
        LDR     a3, [a3, #HAL_Descriptor]
        LDR     a3, [a3, #HALDesc_Size]
        ADD     a3, a3, a1              ; a3 = phys addr of OS if contiguous
        CMP     a2, a3
; Contiguous mapping, remap combined HAL+OS
        MOVEQ   a2, #ROM
        MOVEQ   a4, #OSROM_ImageSize*1024
; Discontiguous mapping, only remap OS
        MOVNE   a1, a2
        LDRNE   a2, =RISCOS_Header
        LDRNE   a4, [a2, #OSHdr_ImageSize]
        MOV     a3, #OSAP_ROM
        BL      Init_MapIn
; Flush & invalidate cache/TLB to ensure everything respects the new page access
; Putting a flush here also means the decompression code doesn't have to worry
; about IMB'ing the decompressed ROM
        ARMop   MMU_Changing ; Perform full clean+invalidate to ensure any lingering cache lines for the decompression workspace are gone
        DebugTX "ROM access changed to read-only"
30
; Allocate the CAM
        LDR     a3, [v8, #SoftCamMapSize]
        LDR     a2, =AreaFlags_CAM
        RSB     a1, a3, #CAMTop
        STR     a1, [v8, #CamEntriesPointer]
        BL      Init_MapInRAM

; Allocate the supervisor stack
        LDR     a1, =SVCStackAddress
        LDR     a2, =AreaFlags_SVCStack
        LDR     a3, =SVCStackSize
        BL      Init_MapInRAM

; Allocate the interrupt stack
        LDR     a1, =IRQStackAddress
        LDR     a2, =AreaFlags_IRQStack
        LDR     a3, =IRQStackSize
        BL      Init_MapInRAM

; Allocate the abort stack
        LDR     a1, =ABTStackAddress
        LDR     a2, =AreaFlags_ABTStack
        LDR     a3, =ABTStackSize
        BL      Init_MapInRAM

; Allocate the undefined stack
        LDR     a1, =UNDStackAddress
        LDR     a2, =AreaFlags_UNDStack
        LDR     a3, =UNDStackSize
        BL      Init_MapInRAM

; Allocate the system heap (just 32K for now - will grow as needed)
        LDR     a1, =SysHeapAddress
        LDR     a2, =AreaFlags_SysHeap
        LDR     a3, =32*1024
        BL      Init_MapInRAM_Clear

; Allocate the cursor/system/sound block - first the cached bit
        LDR     a1, =CursorChunkAddress
        LDR     a2, =AreaFlags_CursorChunkCacheable
        LDR     a3, =SoundDMABuffers - CursorChunkAddress
        BL      Init_MapInRAM_DMA
; then the uncached bit
        LDR     a1, =SoundDMABuffers
        LDR     a2, =AreaFlags_CursorChunk
        LDR     a3, =?SoundDMABuffers
        BL      Init_MapInRAM_DMA

        LDR     a1, =KbuffsBaseAddress
        LDR     a2, =AreaFlags_Kbuffs
        LDR     a3, =(KbuffsSize + &FFF) :AND: &FFFFF000  ;(round to 4k)
        BL      Init_MapInRAM_Clear

 [ HiProcVecs
        ; Map in DebuggerSpace
        LDR     a1, =DebuggerSpace
        LDR     a2, =AreaFlags_DebuggerSpace
        LDR     a3, =(DebuggerSpace_Size + &FFF) :AND: &FFFFF000
        BL      Init_MapInRAM_Clear
 ]

 [ MinorL2PThack
; Allocate backing L2PT for application space
; Note that ranges must be 4M aligned, as AllocateL2PT only does individual
; (1M) sections, rather than 4 at a time, corresponding to an L2PT page. The
; following space is available for dynamic areas, and ChangeDyn.s will get
; upset if it sees only some of a set of 4 section entries pointing to the
; L2PT page.
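; (A 4KB L2PT page holds 1024 four-byte entries, each mapping 4KB, so one
; L2PT page covers 4MB - exactly the four 1MB sections described by the four
; corresponding L1PT entries.)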
        MOV     a1, #0
        MOV     a2, #AplWorkMaxSize             ; Not quite right, but the whole thing's wrong anyway
        ASSERT  AplWorkMaxSize :MOD: (4*1024*1024) = 0
        BL      AllocateL2PT
; And for the system heap. Sigh
        LDR     a1, =SysHeapAddress
        LDR     a2, =SysHeapMaxSize
        ASSERT  SysHeapAddress :MOD: (4*1024*1024) = 0
        ASSERT  SysHeapMaxSize :MOD: (4*1024*1024) = 0
        BL      AllocateL2PT
 ]

        STR     v2, [v8, #InitUsedEnd]

        ; Put InitDMABlock back into PhysRamTable
        Push    "v1-v7"
        ASSERT  InitDMAOffset = InitDMABlock+8
        ADD     v1, v8, #InitDMABlock
        LDMIA   v1, {v1-v3}
        ADD     v3, v3, #PhysRamTable
        ADD     v3, v3, v8
        ; Work out whether the block was removed or merely shrunk
        LDMDB   v3, {v4-v5}
        ADD     v6, v1, v2, LSR #12
        ADD     v7, v4, v5, LSR #12
        STMDB   v3, {v1-v2}
        TEQ     v6, v7
        BEQ     %FT40                   ; End addresses match, it was shrunk
35
        LDMIA   v3, {v1-v2}             ; Shuffle following entries down
        STMIA   v3!, {v4-v5}
        MOV     v4, v1
        MOVS    v5, v2
        BNE     %BT35
40
        Pull    "v1-v7"

        MSR     CPSR_c, #F32_bit+I32_bit+IRQ32_mode
        LDR     sp, =IRQSTK
        MSR     CPSR_c, #F32_bit+I32_bit+ABT32_mode
        LDR     sp, =ABTSTK
        MSR     CPSR_c, #F32_bit+I32_bit+UND32_mode
        LDR     sp, =UNDSTK
        MSR     CPSR_c, #F32_bit+SVC2632
        LDR     sp, =SVCSTK

        BL      ConstructCAMfromPageTables

        MOV     a1, #4096
        STR     a1, [v8, #Page_Size]

        BL      CountPageTablePages

        BL      Count32bitPages

        B       Continue_after_HALInit

        LTORG

 [ MEMM_Type = "VMSAv6"
HAL_InvalidateCache_ARMvF
        ; Cache invalidation for ARMs with multiple cache levels, used before ARMop initialisation
        ; This function gets called before we have a stack set up, so we've got to preserve as many registers as possible
        ; The only register we can safely change is ip, but we can switch into FIQ mode with interrupts disabled and use the banked registers there
        MRS     ip, CPSR
        MSR     CPSR_c, #F32_bit+I32_bit+FIQ32_mode
        MOV     r9, #0
        MCR     p15, 0, r9, c7, c5, 0           ; invalidate instruction cache
        MCR     p15, 0, r9, c8, c7, 0           ; invalidate TLBs
        MCR     p15, 0, r9, c7, c5, 6           ; invalidate branch target predictor
        myDSB   ,r9,,y                          ; Wait for completion
        myISB   ,r9,,y
        ; Check whether we're ARMv7 (and thus multi-level cache) or ARMv6 (and thus single-level cache)
        MRC     p15, 0, r8, c0, c0, 1
        TST     r8, #&80000000 ; EQ=ARMv6, NE=ARMv7
        BEQ     %FT80

        ; This is basically the same algorithm as the MaintainDataCache_WB_CR7_Lx macro, but tweaked to use less registers and to read from CP15 directly
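        ; Each set/way word passed to DCISW is assembled as: way number in
        ; the top bits (bit position found via CLZ of the way count),
        ; set/index in the middle (shifted left by log2 of the line size),
        ; and the cache level in bits 3:1.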
        MRC     p15, 1, r8, c0, c0, 1           ; Read CLIDR to r8
        TST     r8, #&07000000
        BEQ     %FT50
        MOV     r11, #0 ; Current cache level
10 ; Loop1
        ADD     r10, r11, r11, LSR #1 ; Work out 3 x cache level
        MOV     r9, r8, LSR r10 ; bottom 3 bits are the Cache type for this level
        AND     r9, r9, #7 ; get those 3 bits alone
        CMP     r9, #2
        BLT     %FT40 ; no cache or only instruction cache at this level
        MCR     p15, 2, r11, c0, c0, 0 ; write CSSELR from r11
        ISB
        MRC     p15, 1, r9, c0, c0, 0 ; read current CSSIDR to r9
        AND     r10, r9, #CCSIDR_LineSize_mask ; extract the line length field
        ADD     r10, r10, #4 ; add 4 for the line length offset (log2 16 bytes)
        LDR     r8, =CCSIDR_Associativity_mask:SHR:CCSIDR_Associativity_pos
        AND     r8, r8, r9, LSR #CCSIDR_Associativity_pos ; r8 is the max number of the way size (right aligned)
        CLZ     r13, r8 ; r13 is the bit position of the way size increment
        LDR     r12, =CCSIDR_NumSets_mask:SHR:CCSIDR_NumSets_pos
        AND     r12, r12, r9, LSR #CCSIDR_NumSets_pos ; r12 is the max number of the index size (right aligned)
20 ; Loop2
        MOV     r9, r12 ; r9 working copy of the max index size (right aligned)
30 ; Loop3
        ORR     r14, r11, r8, LSL r13 ; factor in the way number and cache number into r14
        ORR     r14, r14, r9, LSL r10 ; factor in the index number
        DCISW   r14 ; Invalidate
        SUBS    r9, r9, #1 ; decrement the index
        BGE     %BT30
        SUBS    r8, r8, #1 ; decrement the way number
        BGE     %BT20
        DSB                ; Cortex-A7 errata 814220: DSB required when changing cache levels when using set/way operations. This also counts as our end-of-maintenance DSB.
        MRC     p15, 1, r8, c0, c0, 1
40 ; Skip
        ADD     r11, r11, #2
        AND     r14, r8, #&07000000
        CMP     r14, r11, LSL #23
        BGT     %BT10

50 ; Finished
        ; Wait for clean to complete
        MOV     r8, #0
        MCR     p15, 0, r8, c7, c5, 0           ; invalidate instruction cache
        MCR     p15, 0, r8, c8, c7, 0           ; invalidate TLBs
        MCR     p15, 0, r8, c7, c5, 6           ; invalidate branch target predictor
        myDSB   ,r8,,y                          ; Wait for completion
        myISB   ,r8,,y
        ; All caches clean; switch back to SVC, then recover the stored PSR from ip (although we can be fairly certain we started in SVC anyway)
        MSR     CPSR_c, #F32_bit+I32_bit+SVC32_mode
        MSR     CPSR_cxsf, ip
        MOV     pc, lr
80 ; ARMv6 case
        MCR     ARM_config_cp,0,r9,ARMv4_cache_reg,C7 ; ARMv3-ARMv6 I+D cache flush
        B       %BT50
 ] ; MEMM_Type = "VMSAv6"

CountPageTablePages ROUT
        Entry
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #CamEntriesPointer]
        LDR     a3, [a1, #MaxCamEntry]
      [ ZeroPage <> 0
        MOV     a1, #0
      ]
        ADD     a3, a3, #1
        ADD     a4, a2, a3, LSL #CAM_EntrySizeLog2
      [ LongDesc
        LDR     lr, =LL3PT
10      LDR     ip, [a4, #CAM_LogAddr-CAM_EntrySize]!
        SUB     ip, ip, lr
        CMP     ip, #8:SHL:20
        ADDLO   a1, a1, #4096
        TEQ     a4, a2
        BNE     %BT10
      |
        ASSERT  (L2PT :AND: &3FFFFF) = 0
        LDR     lr, =L2PT :SHR: 22
10      LDR     ip, [a4, #CAM_LogAddr-CAM_EntrySize]!
        TEQ     lr, ip, LSR #22
        ADDEQ   a1, a1, #4096
        TEQ     a4, a2
        BNE     %BT10
      ]
        LDR     a2, =ZeroPage
        STR     a1, [a2, #LxPTUsed]
        EXIT

Count32bitPages ROUT
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #MaxCamEntry]
      [ LongDesc
        ; >32bit RAM addresses are supported, so examine PhysRamTable to
        ; determine the last page number with a 32bit address
        Entry
        MOV     a3, #-1
        ADD     a4, a1, #PhysRamTable
10
        LDMIA   a4!, {ip, lr}
        CMP     ip, #1:SHL:20                   ; Address below 4G?
        ADDLO   a3, a3, lr, LSR #12             ; Count it up
        CMPLO   a3, a2                          ; Don't overrun the table
        BLO     %BT10
        STR     a3, [a1, #MaxCamEntry32]
        ; Update ProcessorFlags
        CMP     a2, a3
        LDRNE   a2, [a1, #ProcessorFlags]
        ORRNE   a2, a2, #CPUFlag_HighRAM
        STRNE   a2, [a1, #ProcessorFlags]
        EXIT
      |
        ; No 64bit support, so all pages must have 32bit addresses
        STR     a2, [a1, #MaxCamEntry32]
        MOV     pc, lr
      ]

; int PhysAddrToPageNo(uint64_t addr)
;
; Converts a physical address to the page number of the page containing it.
; Returns -1 if address is not in RAM.

PhysAddrToPageNo
        ; Convert address to 4K addressing
        MOV     a1, a1, LSR #12
        ORR     a1, a1, a2, LSL #20
        MOV     a4, #0
        LDR     ip, =ZeroPage + PhysRamTable
10      LDMIA   ip!, {a2, a3}                   ; get phys addr, size
        MOVS    a3, a3, LSR #12                 ; end of list? (size=0)
        BEQ     %FT90                           ;   then it ain't RAM
        SUB     a2, a1, a2                      ; a2 = amount into this bank
        CMP     a2, a3                          ; if more than size
        ADDHS   a4, a4, a3                      ;   increase counter by size of bank
        BHS     %BT10                           ;   and move to next
        ADD     a1, a4, a2                      ; add offset to counter
        MOV     pc, lr

90      MOV     a1, #-1
        MOV     pc, lr
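
; Illustrative C equivalent (a sketch, assuming the PhysRamTable layout
; implied above: word 0 = bank start in 4K page units, word 1 = bank length
; in bytes with flags in the low 12 bits):
;
;   int PhysAddrToPageNo(uint64_t addr)
;   {
;       uint32_t page = (uint32_t)(addr >> 12), count = 0;
;       const uint32_t *entry = PhysRamTable;
;       for (; (entry[1] >> 12) != 0; entry += 2)
;       {
;           uint32_t size = entry[1] >> 12;   /* bank size in pages */
;           if (page - entry[0] < size)       /* inside this bank? */
;               return count + (page - entry[0]);
;           count += size;
;       }
;       return -1;                            /* not in RAM */
;   }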


; A routine to construct the soft CAM from the page tables. This is used
; after a soft reset, and also on a hard reset as it's an easy way of
; clearing up after the recursive page table allocation.

        ROUT
ConstructCAMfromPageTables
        Push    "v1-v8, lr"
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #MaxCamEntry]
        LDR     v1, [a1, #CamEntriesPointer]    ; v1 -> CAM (for whole routine)
        ADD     a2, a2, #1
        ADD     a2, v1, a2, LSL #CAM_EntrySizeLog2

        LDR     a3, =DuffEntry                  ; Clear the whole CAM, from
        MOV     a4, #AreaFlags_Duff             ; the top down.
        ASSERT  CAM_LogAddr=0
        ASSERT  CAM_PageFlags=4
        ASSERT  CAM_PMP=8
        ASSERT  CAM_PMPIndex=12
        ASSERT  CAM_EntrySize=16
        MOV     v2, #0
        MOV     v3, #-1
10      STMDB   a2!, {a3, a4, v2, v3}
        CMP     a2, v1
        BHI     %BT10

        MOV     v2, #0                          ; v2 = logical address
30      MOV     a1, v2
        BL      LoadAndDecodeL1Entry            ; a1,a2 = phys addr, a3 = page flags/type, a4 = page size (bytes)
        CMP     a3, #-2                         ; Only care about page table pointers
        BEQ     %FT39
        ADDS    v2, v2, a4
        BCC     %BT30
        Pull    "v1-v8, pc"

39      ADD     v7, v2, a4
40      MOV     a1, v2
        BL      LoadAndDecodeL2Entry            ; a1,a2 = phys addr, a3 = flags (-1 if fault), a4 = page size (bytes)
        CMP     a3, #-1                         ; move to next page if fault
        BEQ     %FT80
        SUBS    a4, a4, #4096                   ; large pages get bits 12-15 from the virtual address
        ANDNE   lr, v2, a4
        ORR     v6, a3, #PageFlags_Unavailable
        ORRNE   a1, a1, lr
        BL      PhysAddrToPageNo                ; -1 if unknown page
        ADDS    a1, v1, a1, LSL #CAM_EntrySizeLog2 ; a1 -> CAM entry
        ASSERT  CAM_LogAddr=0
        ASSERT  CAM_PageFlags=4
        STMCCIA a1, {v2, v6}                    ; store logical address, PPL

80      ADDS    v2, v2, #&00001000              ; always advance in 4K units here
        TEQ     v2, v7
        BNE     %BT40                           ; more to do from this L1 entry
        BCC     %BT30                           ; logical address not wrapped yet

        Pull    "v1-v8, pc"



; Allocate a physical page from DRAM
;
; On entry:
;    v1 -> current entry in PhysRamTable
;    v2 -> end of last used physical page (page units)
; On exit:
;    a1 -> next free page (assumed 32bit address)
;    v1, v2 updated
;
; No out of memory check...

Init_ClaimPhysicalPage
        MOV     a1, v2
        LDMIA   v1, {a2, a3}
        ADD     a2, a2, a3, LSR #12             ; a2 = end of this bank
        CMP     v2, a2                          ; advance v2 to next bank if
        LDRHS   a1, [v1, #8]!                   ; this bank is fully used
        ADD     v2, a1, #1
        MOV     a1, a1, LSL #12                 ; Convert to byte address
        MOV     pc, lr

; Allocate and map in some RAM.
;
; On entry:
;    a1 = logical address
;    a2 = access permissions (see Init_MapIn)
;    a3 = length (4K multiple)
;    v1 -> current entry in PhysRamTable
;    v2 = next physical address (page units)
;    v3 -> L1PT
;
; On exit:
;    a1 -> physical address of start of RAM (deduce the rest from PhysRamTable)
;
; No out of memory check...
Init_MapInRAM ROUT
        Push    "v4-v8,lr"
        MOV     v8, #-1
        MOV     v5, a3                          ; v5 = amount of memory required
        MOV     v6, a1                          ; v6 = logical address
        MOV     v7, a2                          ; v7 = access permissions
10      LDMIA   v1, {v4, ip}                    ; v4 = addr of bank, ip = len+flags
        MOV     ip, ip, LSR #12
        SUB     v4, v2, v4                      ; v4 = amount of bank used
        RSBS    v4, v4, ip                      ; v4 = amount of bank left (pages)
        LDREQ   v2, [v1, #8]!                   ; move to next bank if 0 left
        BEQ     %BT10

        CMP     v8, #-1                         ; is this the first bank?
        MOVEQ   v8, v2                          ; remember it

        CMP     v4, v5, LSR #12                 ; sufficient in this bank?
        MOVHS   a4, v5
        MOVLO   a4, v4, LSL #12                 ; a4 = amount to take

        MOV     a1, v2, LSL #12                 ; set up parameters for MapIn call
        MOV     a2, v6                          ; then move globals (in case MapIn
        MOV     a3, v7                          ; needs to allocate for L2PT)
        ADD     v2, v2, a4, LSR #12             ; advance physaddr
        SUB     v5, v5, a4                      ; decrease wanted
        ADD     v6, v6, a4                      ; advance log address pointer
        BL      Init_MapIn                      ; map in the RAM
        TEQ     v5, #0                          ; more memory still required?
        BNE     %BT10

        MOV     a1, v8
        Pull    "v4-v8,pc"

Init_MapInRAM_Clear ROUT                        ; same as Init_MapInRAM but also
        Push    "a1,a3,v5,lr"                   ; clears the mapped in result
        BL      Init_MapInRAM
        MOV     v5, a1
        Pull    "a1,a3"
        MOV     a2, #0
        BL      memset
        MOV     a1, v5
        Pull    "v5,pc"

; Allocate and map a physically contiguous chunk of some DMAable RAM.
;
; On entry:
;    a1 = logical address
;    a2 = access permissions (see Init_MapIn)
;    a3 = length (4K multiple)
;    v1 -> current entry in PhysRamTable
;    v2 = next physical address (page units)
;    v3 -> L1PT
;
; On exit:
;    a1 -> physical address of start of RAM (deduce the rest from PhysRamTable)
;
; Use this routine with caution - correct total amount of required DMA memory
; must have been calculated beforehand and stashed in InitDMABlock
Init_MapInRAM_DMA ROUT
        Push    "a1,a3,v4-v5,ip,lr"
        TEQ     v3, #0                          ; MMU on?
        LDREQ   v4, =ZeroPage                   ; get workspace directly
        ADDNE   v4, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables ; deduce from L1PT
        LDR     v5, [v4, #InitDMAEnd]
        ADD     lr, v5, a3                      ; claim the RAM
        STR     lr, [v4, #InitDMAEnd]

        MOV     a4, a3
        MOV     a3, a2
        MOV     a2, a1
        MOV     a1, v5
        BL      Init_MapIn                      ; map it in
        ; DMA regions won't get cleared by ClearWkspRam, so do it manually
        ; Could potentially skip this if the HAL says RAM is already clear, but
        ; for now do it anyway (especially since startup flags haven't been set
        ; when we're first called)
        Pull    "a1,a3"
        TEQ     v3, #0
        MOVNE   a1, v5
        MOV     a2, #0
        BL      memset
        MOV     a1, v5
        Pull    "v4-v5,ip,pc"

; Map a range of physical addresses to a range of logical addresses.
;
; On entry:
;    a1 = physical address (32bit)
;    a2 = logical address
;    a3 = DA flags
;    a4 = area size (4K multiple)
;    v1 -> current entry in PhysRamTable
;    v2 = last used physical address (page units)
;    v3 -> L1PT (or 0 if MMU on)

Init_MapIn ROUT
        Entry   "v4-v7"
        MOV     v4, a1                          ; v4 = physaddr
        MOV     v5, a2                          ; v5 = logaddr
        MOV     v6, a3                          ; v6 = page flags
        MOV     v7, a4                          ; v7 = area size
        ; Set up a2-a4 for the Get*PTE functions
        TEQ     v3, #0
        LDREQ   a3, =ZeroPage
        ADDNE   a3, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        MOV     a2, v6
        LDR     a4, [a3, #MMU_PCBTrans]
        LDR     a3, [a3, #MMU_PPLTrans]

        ORR     lr, v4, v5                      ; OR together, physaddr, logaddr
        ORR     lr, lr, v7                      ; and size.
      [ LongDesc
        MOVS    ip, lr, LSL #11                 ; If all bottom 21 bits 0
        BEQ     %FT50                           ; it's L2PT block mapped
      |
        MOVS    ip, lr, LSL #12                 ; If all bottom 20 bits 0
        BEQ     %FT50                           ; it's section mapped
      ]

        MOV     a1, #0                          ; We don't want the address in the result

        MOVS    ip, lr, LSL #16                 ; If bottom 16 bits not all 0
        ADR     lr, %FT10
        BNE     Get4KPTE                        ; then small pages (4K)

        BL      Get64KPTE                       ; else large pages (64K)
10
      [ LongDesc
        ORR     v6, a1, a2                      ; v6 = high & low attributes
      |
        MOV     v6, a1                          ; v6 = access permissions
      ]

20      MOV     a1, v4
        MOV     a2, v5
        MOV     a3, v6
        BL      Init_MapInPage                  ; Loop through mapping in each
        ADD     v4, v4, #4096                   ; page in turn
        ADD     v5, v5, #4096
        SUBS    v7, v7, #4096
        BNE     %BT20
        EXIT

      [ LongDesc
50
        BL      Get2MPTE
        MOVS    ip, v3                          ; is MMU on?
        LDREQ   ip, =LL2PT                      ; then use virtual address
        ADD     a3, ip, v5, LSR #18             ; a3 -> L2PT entry
70      STRD    a1, [a3], #8                    ; And store in L2PT
        ADD     a1, a1, #2*1024*1024            ; Advance two megabytes
        SUBS    v7, v7, #2*1024*1024            ; and loop
        BNE     %BT70
        EXIT
      |
50
        BL      Get1MPTE
        MOVS    ip, v3                          ; is MMU on?
        LDREQ   ip, =L1PT                       ; then use virtual address
        ADD     a2, ip, v5, LSR #18             ; a2 -> L1PT entry
70      STR     a1, [a2], #4                    ; And store in L1PT
        ADD     a1, a1, #1024*1024              ; Advance one megabyte
        SUBS    v7, v7, #1024*1024              ; and loop
        BNE     %BT70
        EXIT
      ]

; Map a logical page to a physical page, allocating L2PT as necessary.
;
; On entry:
;    a1 = physical address (32bit)
;    a2 = logical address
 [ LongDesc
;    a3 = high & low page attributes merged into one word
 |
;    a3 = access permissions + C + B bits + size (all non-address bits, of appropriate type)
 ]
;    v1 -> current entry in PhysRamTable
;    v2 = last used physical address (page units)
;    v3 -> L1PT (or 0 if MMU on)
; On exit:
;    a1 = logical address
;    a2-a4, ip corrupt
;    v1, v2 updated
;

Init_MapInPage  ROUT
        Entry   "v4-v6"
        MOV     v4, a1                          ; v4 = physical address
        MOV     v5, a2                          ; v5 = logical address
        MOV     v6, a3                          ; v6 = access permissions
        MOV     a1, v5
        MOV     a2, #4096
        BL      AllocateL2PT
        TEQ     v3, #0                          ; if MMU on, access L2PT virtually...
      [ LongDesc
        LDREQ   a1, =LL3PT
        MOVEQ   ip, v5                          ; index using whole address
        BEQ     %FT40
        MOV     ip, v5, LSR #21
        ADD     a1, v3, ip, LSL #3
        LDRD    a1, [a1]                        ; a1 = level two descriptor
        BFC     a1, #0, #LL_LowAddr_Start       ; a1 -> L3PT tables for this section
        MOV     ip, v5, LSL #32-(12+9)
        MOV     ip, ip, LSR #32-(12+9)          ; extract L3 table index bits
40      MOV     a3, v4
        BFI     a3, v6, #0, #LL_LowAddr_Start   ; Add low attributes
        MOV     a4, v6
        BFC     a4, #0, #LL_HighAttr_Start      ; High attributes
        ADD     a1, a1, ip, LSR #9
        STRD    a3, [a1]                        ; update L3PT entry
        MOV     a1, v5
        EXIT
      |
        LDREQ   a1, =L2PT                       ; a1 -> L2PT virtual address
        MOVEQ   ip, v5                          ; index using whole address
        BEQ     %FT40
        MOV     ip, v5, LSR #20
        LDR     a1, [v3, ip, LSL #2]            ; a1 = level one descriptor
        MOV     a1, a1, LSR #10
        MOV     a1, a1, LSL #10                 ; a1 -> L2PT tables for this section
        AND     ip, v5, #&000FF000              ; extract L2 table index bits
40      AND     lr, v6, #3
        TEQ     lr, #L2_LargePage               ; strip out surplus address bits from
        BICEQ   v4, v4, #&0000F000              ; large page descriptors
        ORR     lr, v4, v6                      ; lr = value for L2PT entry
        STR     lr, [a1, ip, LSR #10]           ; update L2PT entry
        MOV     a1, v5
        EXIT
      ]



; On entry:
;    a1 = virtual address L2PT required for
;    a2 = number of bytes of virtual space
;    v1 -> current entry in PhysRamTable
;    v2 = last used physical address (page units)
;    v3 -> L1PT (or 0 if MMU on)
; On exit:
;    a1-a4,ip corrupt
;    v1, v2 updated
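; (The entry address arithmetic below follows from the table geometry: in
; the long descriptor format each 8-byte L2PT entry covers 2MB, so the entry
; for an address lives at LL2PT + (addr>>21)*8; in the short descriptor
; format each 4-byte L1PT entry covers 1MB, at L1PT + (addr>>20)*4.)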
AllocateL2PT ROUT
 [ LongDesc
        Entry   "v4-v8"
        MOV     v8, a1, LSR #21                 ; round base address down to 2M
        ADD     lr, a1, a2
        MOV     v7, lr, LSR #21
        TEQ     lr, v7, LSL #21
        ADDNE   v7, v7, #1                      ; round end address up to 2M

        MOVS    v6, v3
        LDREQ   v6, =LL2PT                      ; v6->L2PT (whole routine)

05      ADD     a3, v6, v8, LSL #3              ; L2PT contains 1 entry per 2MB
        LDRD    a1, [a3]
        TEQ     a1, #0                          ; if non-zero, the L3PT has
        BNE     %FT40                           ; already been allocated
        ; With 8 bytes per entry, every 4KB L3PT page covers 2MB of RAM, the
        ; same as the single L2PT entry covers. So no need to deal with
        ; partially used pages as in the short descriptor case.
        BL      Init_ClaimPhysicalPage          ; Claim a page to put L3PT in
        MOV     v4, a1

; Need to zero the L3PT. Must do it before calling Init_MapInPage, as that may
; well want to put something in the thing we are clearing. If the MMU is off,
; no problem, but if the MMU is on, then the L3PT isn't accessible until we've
; called Init_MapInPage. Solution is to use the AccessPhysicalAddress call.

        TEQ     v3, #0                          ; MMU on?
        MOVNE   a1, v4                          ; if not, just access v4
        LDREQ   a1, =OSAP_None+DynAreaFlags_NotCacheable ; if so, map in v4
        MOVEQ   a2, v4
        SUBEQ   sp, sp, #4
        MOVEQ   a3, #0
        MOVEQ   a4, sp
        BLEQ    AccessPhysicalAddress

        MOV     a2, #0
        MOV     a3, #4*1024
        BL      memset

        TEQ     v3, #0
        LDREQ   a1, [sp], #4
        BLEQ    ReleasePhysicalAddress

        ; Fill in the L2PT entry
        ORR     a1, v4, #LL12_Table
        MOV     a2, #0
        ADD     a3, v6, v8, LSL #3
        STRD    a1, [a3]

        ; Get the correct page table entry flags for Init_MapInPage
        TEQ     v3, #0
        LDREQ   a3, =ZeroPage
        ADDNE   a3, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        LDR     a2, [a3, #PageTable_PageFlags]
        LDR     a4, [a3, #MMU_PCBTrans]
        LDR     a3, [a3, #MMU_PPLTrans]
        MOV     a1, #0
        BL      Get4KPTE

        ORR     a3, a1, a2
        MOV     a1, v4                          ; Map in the L3PT page itself
        LDR     a2, =LL3PT                      ; (can't recurse, because L3PT
        ADD     a2, a2, v8, LSL #12             ; backing for L3PT is preallocated)
        BL      Init_MapInPage


40      ADD     v8, v8, #1                      ; go back until all
        CMP     v8, v7                          ; pages allocated
        BLO     %BT05

        EXIT
 |
        Entry   "v4-v8"
        MOV     v8, a1, LSR #20                 ; round base address down to 1M
        ADD     lr, a1, a2
        MOV     v7, lr, LSR #20
        TEQ     lr, v7, LSL #20
        ADDNE   v7, v7, #1                      ; round end address up to 1M

        MOVS    v6, v3
        LDREQ   v6, =L1PT                       ; v6->L1PT (whole routine)

05      LDR     v5, [v6, v8, LSL #2]            ; L1PT contains 1 word per M
        TEQ     v5, #0                          ; if non-zero, the L2PT has
                                                ; already been allocated
        BNE     %FT40

        BIC     lr, v8, #3                      ; round down to 4M - each page
        ADD     lr, v6, lr, LSL #2              ; of L2PT maps to 4 sections
        LDMIA   lr, {a3,a4,v5,ip}               ; check if any are page mapped
        ASSERT  L1_Fault = 2_00 :LAND: L1_Page = 2_01 :LAND: L1_Section = 2_10
        TST     a3, #1
        TSTEQ   a4, #1
        TSTEQ   v5, #1
        TSTEQ   ip, #1
        BEQ     %FT20                           ; nothing page mapped - claim a page

        TST     a4, #1                          ; at least one of the sections is page mapped
        SUBNE   a3, a4, #1*1024                 ; find out where it's pointing to and
        TST     v5, #1                          ; derive the corresponding address for our
        SUBNE   a3, v5, #2*1024                 ; section
        TST     ip, #1
        SUBNE   a3, ip, #3*1024

        AND     lr, v8, #3
        ORR     a3, a3, lr, LSL #10
        STR     a3, [v6, v8, LSL #2]            ; fill in the L1PT entry
        B       %FT40                           ; no more to do

20      BL      Init_ClaimPhysicalPage          ; Claim a page to put L2PT in
        MOV     v4, a1

; Need to zero the L2PT. Must do it before calling Init_MapInPage, as that may
; well want to put something in the thing we are clearing. If the MMU is off,
; no problem, but if the MMU is on, then the L2PT isn't accessible until we've
; called Init_MapInPage. Solution is to use the AccessPhysicalAddress call.

        TEQ     v3, #0                          ; MMU on?
        MOVNE   a1, v4                          ; if not, just access v4
        LDREQ   a1, =OSAP_None+DynAreaFlags_NotCacheable ; if so, map in v4
        MOVEQ   a2, v4
        SUBEQ   sp, sp, #4
        MOVEQ   a3, #0
        MOVEQ   a4, sp
        BLEQ    AccessPhysicalAddress

        MOV     a2, #0
        MOV     a3, #4*1024
        BL      memset

        TEQ     v3, #0
        LDREQ   a1, [sp], #4
        BLEQ    ReleasePhysicalAddress

      [ MEMM_Type = "VMSAv6"
        ORR     a3, v4, #L1_Page
      |
        ORR     a3, v4, #L1_Page + L1_U         ; Set the U bit for ARM6 (assume L2 pages will generally be cacheable)
      ]
        AND     lr, v8, #3
        ORR     a3, a3, lr, LSL #10
        STR     a3, [v6, v8, LSL #2]            ; fill in the L1PT

        ; Get the correct page table entry flags for Init_MapInPage
        TEQ     v3, #0
        LDREQ   a3, =ZeroPage
        ADDNE   a3, v3, #DRAMOffset_PageZero-DRAMOffset_PageTables
        LDR     a2, [a3, #PageTable_PageFlags]
        LDR     a4, [a3, #MMU_PCBTrans]
        LDR     a3, [a3, #MMU_PPLTrans]
        MOV     a1, #0
        BL      Get4KPTE

        MOV     a3, a1
        MOV     a1, v4                          ; Map in the L2PT page itself
        LDR     a2, =L2PT                       ; (can't recurse, because L2PT
        ADD     a2, a2, v8, LSL #10             ; backing for L2PT is preallocated)
        BIC     a2, a2, #&C00
        BL      Init_MapInPage


40      ADD     v8, v8, #1                      ; go back until all
        CMP     v8, v7                          ; pages allocated
        BLO     %BT05

        EXIT
 ]


        ; in:  ip = ZeroPage
        ; out: v3 = PhysIllegalMask
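        ; PhysIllegalMask is the set of bits in the high word of a 64-bit
        ; physical address which the CPU cannot address: &FFFFFFFF when only
        ; 32-bit physical addressing is available, or &FFFFFF00 when
        ; supersections are implemented and 40-bit addresses can be used.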
DeterminePhysIllegalMask
        Push    "lr"
        MOV     v3, #&FFFFFFFF         ; By default any upper bits are illegal
        ARM_read_ID lr
        AND     lr, lr, #&F :SHL: 16
        CMP     lr, #ARMvF :SHL: 16    ; Check that feature registers are implemented
        BNE     %FT01                  ; Some ARMv5 chips supported supersections, but let's not worry about them
        MRC     p15, 0, lr, c0, c1, 7  ; ID_MMFR3
        TST     lr, #&F :SHL: 28
        MOVEQ   v3, #&FFFFFF00
01      STR     v3, [ip, #PhysIllegalMask]
        Pull    "pc"

 [ :LNOT: LongDesc
        MACRO
$lab    ConstructIOPTE $pte, $phys_mb, $flags, $tmp
        ; $pte (output)                L1 page table entry word
        ; $phys_mb (input, preserved)  physical address, in megabytes
        ;   for vanilla sections:
        ;     bits 0..11 go in bits 20..31
        ;   for supersections:
        ;     bits 0..3 assumed zero
        ;     bits 4..11 go in bits 24..31
        ;     bits 12..15 go in bits 20..23
        ;     bits 16..20 go in bits 5..8
        ; $flags (input, preserved)    page table attribute bits
        ;
        ; UBFXNE should be safe pre v6T2, since we won't attempt to use
        ; supersections on such CPUs and they won't trap untaken undefined instructions
        ASSERT  $pte <> $phys_mb
        ASSERT  $pte <> $flags
        ASSERT  $pte <> $tmp
        ASSERT  $tmp <> $phys_mb
        ASSERT  $tmp <> $flags
$lab    ANDS    $tmp, $flags, #L1_SS
        UBFXNE  $tmp, $phys_mb, #32-20, #L1_SSb32Width
        ORR     $pte, $phys_mb, $tmp
        UBFXNE  $tmp, $phys_mb, #36-20, #L1_SSb36Width
        ASSERT  L1_SSb32Shift = 20
        ORR     $pte, $flags, $pte, LSL #L1_SSb32Shift
        ORRNE   $pte, $pte, $tmp, LSL #L1_SSb36Shift
        MEND
 ]


; void *AccessPhysicalAddress(unsigned int flags, uint64_t phys, void **oldp)
;
; APCS calling convention.
;
; flags: RISC OS page flags
; phys: Physical address to access
; oldp: Pointer to word to store old state (or NULL)
;
; On exit: Returns logical address corresponding to 'phys'.
;
; Arranges for the physical address 'phys' to be (temporarily) mapped into
; logical memory. In fact, at least the whole megabyte containing 'phys' is
; mapped in. All mappings use the same shared logical window; the current state
; of the window will be returned in 'oldp', to allow it to be restored (via
; ReleasePhysicalAddress) once you've finished with it.
;
; Note: No cache maintenance performed. Assumption is that mappings will be
; non-cacheable.
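;
; Typical usage, in illustrative C (this mirrors the calls that AllocateL2PT
; makes above when it needs to zero a freshly claimed page table page):
;
;   void *old;
;   void *log = AccessPhysicalAddress(OSAP_None+DynAreaFlags_NotCacheable,
;                                     phys, &old);
;   memset(log, 0, 4096);           /* access the physical page */
;   ReleasePhysicalAddress(old);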
AccessPhysicalAddress ROUT
        ; Check physical address is valid on current CPU
        LDR     ip, =ZeroPage
        Push    "a1,v3,lr"
        LDR     v3, [ip, #PhysIllegalMask]
        TST     a3, v3
        BNE     %FT90
      [ LongDesc
        UBFX    v3, a2, #0, #21                         ; v3 = offset for result
        ; Use Get2MPTE to convert into page table entry
        MOV     ip, a2
        BFI     ip, a3, #0, #21                         ; Get2MPTE packs the high address bits into the low bits
        GetPTE  a1, 2M, ip, a1
        ; Force XN (easier to do afterwards since PPL mapping is non-trivial)
        ORR     a2, a2, #LL_Page_HighAttr_XN
        MOV     lr, a4
        LDR     ip, =LL2PT + (PhysicalAccess:SHR:18)    ; ip -> L2PT entry
        LDRD    a3, [ip]                                ; Get old entry
        STRD    a1, [ip]                                ; Put new entry
        ; Compact old entry into a single word if necessary
        CMP     lr, #0
        BFINE   a3, a4, #12, #8
        STRNE   a3, [lr]
        LDR     a1, =PhysicalAccess
        ORR     a1, a1, v3
      |
        ; Use Get1MPTE to convert DA flags into L1PT section-mapping flags
        MOV     ip, #0
        GetPTE  a1, 1M, ip, a1
        ; Mapping size (section or supersection) depends on address
        MOV     lr, a2, LSR #20
        ORR     lr, lr, a3, LSL #12                     ; lr = physical megabyte number
        TEQ     a3, #0
        ORRNE   a1, a1, #L1_SS                          ; need to use supersection for such addresses
        BIC     a2, a2, #&FF000000                      ; at most, bits 0-23 are used as offsets into section/supersection
        BICNE   lr, lr, #&F                             ; if address >4GB, round mapped address to 16MB (supersection)
        BICEQ   a2, a2, #&00F00000                      ; else no further rounding needed (section) and bits 20-23 are not used as an offset either
        ConstructIOPTE v3, lr, a1, ip
        LDR     ip, =L1PT + (PhysicalAccess:SHR:18)     ; ip -> L1PT entry
 [ MEMM_Type = "VMSAv6"
        ORR     v3, v3, #L1_XN                          ; force non-executable to prevent speculative instruction fetches
 ]
        TEQ     a4, #0
        LDRNE   lr, [ip]                                ; read old value (if necessary)
        STRNE   lr, [a4]                                ; put old one in [oldp]
        MOV     a4, #15
        STR     v3, [ip], #4                            ; store first of 16 new L1PT entries
        TST     v3, #L1_SS
        MOVEQ   v3, #0                                  ; if supersection mapped then use 16 duplicate entries, else remaining entries unmapped
10      SUBS    a4, a4, #1
        STR     v3, [ip], #4
        BNE     %BT10

        LDR     a1, =PhysicalAccess
        ORR     a1, a1, a2
      ]
        STR     a1, [sp]
        ARMop   MMU_ChangingUncached                    ; sufficient, cause not cacheable
        Pull    "a1,v3,pc"

90      ; Invalid physical address
        ADD     sp, sp, #1*4
        MOV     a1, #0
        Pull    "v3,pc"

; void ReleasePhysicalAddress(void *old)
;
; APCS calling convention.
;
; Call with the 'oldp' value from a previous AccessPhysicalAddress call to
; restore previous physical access window state.
ReleasePhysicalAddress
      [ LongDesc
        LDR     ip, =LL2PT + (PhysicalAccess:SHR:18)    ; ip -> L2PT entry
        ; The 8 byte page table entry is packed into the 4 byte token as follows:
        ; * Bits 0-11 give the low 12 bits of the page table entry (page type + low attributes)
        ; * Bits 21-31 give bits 21-31 of the page table entry (low PA)
        ; * Bits 12-19 give bits 32-39 of the page table entry (high PA)
        ; * Bit 20 is spare (kept at zero)
        ; * The upper attributes are fixed (always XN)
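        ; Equivalently, in illustrative C:
        ;
        ;   token = (pte_low & 0xFFE00FFF) | ((pte_high & 0xFF) << 12);
        ;
        ; which the code below unpacks again with UBFX/BICS.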
        UBFX    a2, a1, #12, #8                         ; Get high word
        BICS    a1, a1, #&FF000                         ; Get low word
        ORRNE   a2, a2, #LL_Page_HighAttr_XN            ; Always XN
        STRD    a1, [ip]
      |
        LDR     ip, =L1PT + (PhysicalAccess:SHR:18)     ; ip -> L1PT entry
        MOV     a4, #15
        STR     a1, [ip], #4                            ; restore first of 16 L1PT entries
        TST     a1, #L1_SS
        MOVEQ   a1, #0                                  ; if supersection mapped then use 16 duplicate entries, else remaining entries unmapped
10      SUBS    a4, a4, #1
        STR     a1, [ip], #4
        BNE     %BT10
      ]
        ARMop   MMU_ChangingUncached,,tailcall          ; sufficient, cause not cacheable


; void Init_PageTablesChanged(void)
;
; A TLB+cache invalidation that works on all known ARMs. Invalidate all I+D TLB is the _only_ TLB
; op that works on ARM720T, ARM920T and SA110. Ditto invalidate all I+D cache.
;
; DOES NOT CLEAN THE DATA CACHE. This is a helpful simplification, but requires
; that we don't use this routine after we've started using normal RAM.
;
Init_PageTablesChanged
        MOV     a3, lr
        BL      Init_ARMarch
        MOV     ip, #0
        BNE     %FT01
        MCREQ   ARM_config_cp,0,ip,ARMv3_TLBflush_reg,C0
        B       %FT02
01      MCRNE   ARM_config_cp,0,ip,ARMv4_TLB_reg,C7
02
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMvF
        ADREQ   lr, %FT01
        BEQ     HAL_InvalidateCache_ARMvF
        MCRNE   ARM_config_cp,0,ip,ARMv4_cache_reg,C7           ; works on ARMv3
01
 |
        MCR     ARM_config_cp,0,ip,ARMv4_cache_reg,C7           ; works on ARMv3
 ]
        MOV     pc, a3




;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;
;       ClearWkspRAM - Routine to clear "all" workspace
;
; We have to avoid anything between InitUsedStart and InitUsedEnd - i.e.
; the page tables, HAL workspace, etc.
;
; Note that zero page workspace isn't included in InitUsedStart/InitUsedEnd.
; Sensitive areas of it (e.g. PhysRamTable, IRQ vector) are skipped with the
; help of RamSkipTable
;
; The bulk of RAM is cleared during the keyboard scan (ClearFreePoolSection).

;
; out:  r4-r11, r13 preserved
;

ClearWkspRAM ROUT
        MSR     CPSR_c, #F32_bit+FIQ32_mode             ; get some extra registers
        MOV     r8, #0
        MOV     r9, #0
        MOV     r10, #0
        MOV     r11, #0
        MOV     r12, #0
        MOV     r13, #0
        MOV     r14, #0
        MSR     CPSR_c, #F32_bit+SVC32_mode

        LDR     r0, =ZeroPage+InitClearRamWs            ; we can preserve r4-r11,lr in one of the skipped regions
        STMIA   r0,{r4-r11,lr}

        DebugTX "ClearWkspRAM"

        ; Start off by clearing zero page + scratch space, as these:
        ; (a) are already mapped in and
        ; (b) may require the use of the skip table
        LDR     r0, =ZeroPage
        ADD     r1, r0, #16*1024
        ADR     r6, RamSkipTable
        MSR     CPSR_c, #F32_bit+FIQ32_mode             ; switch to our bank o'zeros
        LDR     r5, [r6], #4                            ; load first skip addr
10
        TEQ     r0, r1
        TEQNE   r0, r5
        STMNEIA r0!, {r8-r11}
        BNE     %BT10
        TEQ     r0, r1
        BEQ     %FT20
        LDR     r5, [r6], #4                            ; load skip amount
        ADD     r0, r0, r5                              ; and skip it
        LDR     r5, [r6], #4                            ; load next skip addr
        B       %BT10
20
        LDR     r0, =ScratchSpace
        ADD     r1, r0, #ScratchSpaceSize
30
        TEQ     r0, r1
        STMNEIA r0!, {r8-r11}
        STMNEIA r0!, {r8-r11}
        BNE     %BT30

        MSR     CPSR_c, #F32_bit+SVC32_mode

        LDR     r0, =ZeroPage+InitClearRamWs
        LDMIA   r0, {r4-r11,r14}                        ;restore

      [ {FALSE} ; NewReset sets this later
        LDR     r0, =ZeroPage+OsbyteVars + :INDEX: LastBREAK
        MOV     r1, #&80
        STRB    r1, [r0]                                ; flag the fact that RAM cleared
      ]

        MOV     pc, lr

        LTORG

        MACRO
        MakeSkipTable $addr, $size
        ASSERT  ($addr :AND: 15) = 0
        ASSERT  ($size :AND: 15) = 0
        ASSERT  ($addr-ZeroPage) < 16*1024
        &       $addr, $size
        MEND

        MACRO
        EndSkipTables
        &       -1
        MEND
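
; Each RamSkipTable entry is an (address, size) pair of words, terminated by
; -1; the clearing loop in ClearWkspRAM skips 'size' bytes when its clear
; pointer reaches 'address'.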

RamSkipTable
        MakeSkipTable   ZeroPage, InitWsEnd
        MakeSkipTable   ZeroPage+SkippedTables, SkippedTablesEnd-SkippedTables
        EndSkipTables

;++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
;
;       ClearFreePoolSection - Routine to clear a section of the free pool
;
; During keyboard scanning we soak up slack time clearing the bulk of RAM
; by picking a section of the free pool, mapping it in, clearing & flushing.

;
; In:   r0 = CAM entry to continue from
; Out:  r0 = updated
;

ClearFreePoolSection ROUT
        Push    "r1-r3, lr"
        LDR     r3, =ZeroPage
        LDR     r1, [r3, #MaxCamEntry]
        LDR     r2, =ZeroPage+FreePoolDANode
        CMP     r0, r1
        BHI     %FT30

        LDR     r3, [r3, #CamEntriesPointer]
        ADD     r1, r3, r1, LSL #CAM_EntrySizeLog2      ; top entry (inc)
        ADD     r3, r3, r0, LSL #CAM_EntrySizeLog2      ; starting entry
10
        LDR     r14, [r3, #CAM_PageFlags]
        TST     r14, #DynAreaFlags_PMP
        BEQ     %FT20

        LDR     r14, [r3, #CAM_PMP]
        TEQ     r14, r2
        BEQ     %FT40
20
        ADD     r3, r3, #CAM_EntrySize                  ; next
        CMP     r3, r1
        BLS     %BT10
30
        MOV     r0, #-1
        Pull    "r1-r3, pc"
40
        Push    "r0-r12"

        ; This is a PMP entry in the free pool
        LDR     r14, [r3, #CAM_PMPIndex]                ; list index
        LDR     r9, [r2, #DANode_PMP]                   ; PMP list base
        LDR     r3, [r9, r14, LSL #2]                   ; ppn
        BL      ppn_to_physical                         ; => r8,r9 = PA

      [ MEMM_Type = "ARM600"
        ; Map in this section, cacheable + bufferable to ensure burst writes
        ; are performed (StrongARM will only perform burst writes to CB areas)
        MOV     a1, #OSAP_None
      |
        ; Map in this section with default NCB cache policy. Making it cacheable
        ; is liable to slow things down significantly on some platforms (e.g.
        ; PL310 L2 cache)
        LDR     a1, =OSAP_None + DynAreaFlags_NotCacheable
      ]
        MOV     a2, r8
        MOV     a3, r9
        MOV     a4, #0
        BL      AccessPhysicalAddress

        MOV     r5, r8, LSR #20
        ORR     r5, r5, r9, LSL #12                     ; r5 = physical MB

        MOV     r4, #0                                  ; clear to this value
        MOV     r6, r4
        MOV     r7, r4
        MOV     r12, r4
45
        MOV     r8, r4
        MOV     r9, r4
        MOV     r10, r4
        MOV     r11, r4

        ; Fill that page
        ADD     r2, r0, #4096
50
        STMIA   r0!, {r4,r6-r12}
        STMIA   r0!, {r4,r6-r12}
        TEQ     r0, r2
        BNE     %BT50

        MOV     r10, r5                                 ; previous phys MB

        ; Step the CAM until there are no more pages in that section
        LDR     r1, [sp, #1*4]
        LDR     r2, [sp, #2*4]
        LDR     r11, [sp, #3*4]
        B       %FT65
60
        LDR     r14, [r11, #CAM_PageFlags]
        TST     r14, #DynAreaFlags_PMP
        BEQ     %FT65

        LDR     r14, [r11, #CAM_PMP]
        TEQ     r14, r2
        BEQ     %FT70
65
        ADD     r11, r11, #CAM_EntrySize                ; next
        CMP     r11, r1
        BLS     %BT60

        MOV     r14, #-1                                ; CAM top, no more
        B       %FT80
70

        ; Next PMP entry in the free pool
        LDR     r14, [r11, #CAM_PMPIndex]               ; list index
        LDR     r9, [r2, #DANode_PMP]                   ; PMP list base
        LDR     r3, [r9, r14, LSL #2]                   ; ppn
        BL      ppn_to_physical                         ; => r8,r9 = PA

        MOV     r5, r8, LSR #20
        ORR     r5, r5, r9, LSL #12
        TEQ     r10, r5                                 ; same MB as previous?
        LDRNE   r14, =ZeroPage
        LDRNE   r14, [r14, #CamEntriesPointer]
        SUBNE   r14, r11, r14
        MOVNE   r14, r14, LSR #CAM_EntrySizeLog2        ; no, so compute continuation point
        SUBEQ   r0, r0, #4                              ; wind back to make sure we stay in the correct megabyte of PhysicalAccess
      [ NoARMT2
        MOVEQ   r0, r0, LSR #20
        ORREQ   r0, r0, r8, LSL #12
        MOVEQ   r0, r0, ROR #12
      |
        BFIEQ   r0, r8, #0, #20
      ]
        STREQ   r11, [sp, #3*4]
        BEQ     %BT45                                   ; yes, so clear it
80
        STR     r14, [sp, #0*4]                         ; return value for continuation

 [ MEMM_Type = "ARM600" ; VMSAv6 maps as non-cacheable, so no flush required
        ; Make page uncacheable so the following is safe
        MOV     r4, r0

        LDR     r0, =OSAP_None+DynAreaFlags_NotCacheable
        MOV     r1, r10, LSL #20
        MOV     r2, r10, LSR #12
        MOV     r3, #0
        BL      AccessPhysicalAddress

        MOV     r0, r4

        ; Clean & invalidate the cache before the 1MB window closes
      [ CacheCleanerHack
        ; StrongARM requires special clean code, because we haven't mapped in
        ; DCacheCleanAddress yet. Cheat and only perform a clean, not full
        ; clean + invalidate (should be safe as we've only been writing)
        ARM_read_ID r2
        AND     r2, r2, #&F000
        CMP     r2, #&A000
        BNE     %FT90
85
        SUB     r0, r0, #32                             ; rewind 1 cache line
        ARMA_clean_DCentry r0
        MOVS    r1, r0, LSL #12                         ; start of the MB?
        BNE     %BT85
        B       %FT91
90
      ]
        ARMop Cache_CleanInvalidateAll
 ]
91
        MOV     a1, #L1_Fault
        BL      ReleasePhysicalAddress                  ; reset to default

        Pull    "r0-r12"

        Pull    "r1-r3, pc"

InitProcVecs
        BKPT    &C000                                   ; Reset
        BKPT    &C004                                   ; Undefined Instruction
        BKPT    &C008                                   ; SWI
        BKPT    &C00C                                   ; Prefetch Abort
        SUBS    pc, lr, #4                              ; ignore data aborts
        BKPT    &C014                                   ; Address Exception
        LDR     pc, InitProcVecs + InitIRQHandler       ; IRQ
        BKPT    &C01C                                   ; FIQ
InitProcVec_FIQ
        DCD     0
InitProcVecsEnd

;
; In:  a1 = flags  (L1_B,L1_C,L1_TEX)
;           bit 20 set if doubly mapped
;           bit 21 set if L1_AP specified (else default to AP_None)
;      a2 = physical address
;      a3 = size
; Out: a1 = assigned logical address, or 0 if failed (no room)
;
RISCOS_MapInIO ROUT
        MOV     a4, a3
        MOV     a3, #0
        ; drop through...
;
; In:  a1 = flags  (L1_B,L1_C,L1_TEX)
;           bit 20 set if doubly mapped
;           bit 21 set if L1_AP specified (else default to AP_None)
;      a2,a3 = physical address
;      a4 = size
; Out: a1 = assigned logical address, or 0 if failed (no room)
;
RISCOS_MapInIO64 ROUT

; Will detect and return I/O space already mapped appropriately, or map and return new space
; For simplicity and speed of search, works on a section (1MB) granularity
;

        ASSERT  L1_B = 1:SHL:2
        ASSERT  L1_C = 1:SHL:3
 [ MEMM_Type = "VMSAv6"
        ASSERT  L1_AP = 2_100011 :SHL: 10
        ASSERT  L1_TEX = 2_111 :SHL: 12
 |
        ASSERT  L1_AP = 3:SHL:10
        ASSERT  L1_TEX = 2_1111 :SHL: 12
 ]
MapInFlag_DoublyMapped * 1:SHL:20
MapInFlag_APSpecified * 1:SHL:21

    [ LongDesc
        ! 0, "LongDescTODO Decide how to handle this"
        AND     a1, a1, #MapInFlag_DoublyMapped
        ORR     a1, a1, #LL12_Block+LLAttr_SO           ; privileged device mapping
        ORR     a1, a1, #LL_Page_LowAttr_AF+LL_Page_LowAttr_SH1+LL_Page_LowAttr_SH0
RISCOS_MapInIO_LowAttr ; a1 bits 0-11 = low attributes, bits 20+ = our extra flags
    |
        TST     a1, #MapInFlag_APSpecified
        BICEQ   a1, a1, #L1_AP
        ; For VMSAv6, assume HAL knows what it's doing and requests correct settings for AP_ROM
        ORREQ   a1, a1, #L1_APMult * AP_None
        BIC     a1, a1, #3
 [ MEMM_Type = "VMSAv6"
        ORR     a1, a1, #L1_Section+L1_XN               ; force non-executable to prevent speculative instruction fetches
 |
        ORR     a1, a1, #L1_Section
 ]
RISCOS_MapInIO_PTE ; a1 bits 0-19 = L1 section entry flags, bits 20+ = our extra flags
     ]
        Entry   "a2,v1-v8"
        LDR     ip, =ZeroPage
        SUB     a4, a4, #1                              ; reduce by 1 so end physical address is inclusive
        ADDS    v1, a2, a4
        ADC     v2, a3, #0                              ; v1,v2 = end physical address
        LDR     v3, [ip, #PhysIllegalMask]
        TST     v2, v3
        MOVNE   a1, #0
        BNE     %FT90                                   ; can't map in physical addresses in this range

 [ LongDesc
        MOV     v4, a2, LSR #21
        ORR     v4, v4, a3, LSL #11                     ; v4 = physical start 2MB to map
        MOV     v5, v1, LSR #21
        ORR     v5, v5, v2, LSL #11
        ADD     v5, v5, #1                              ; v5 = exclusive physical end 2MB to map
        ANDS    v8, a1, #MapInFlag_DoublyMapped
        SUBNE   v8, v5, v4                              ; v8 = offset of second mapping (in 2MB) or 0
        UBFX    a1, a1, #0, #LL_LowAttr_Start+LL_LowAttr_Size ; mask out our extra flags
        ORR     a1, a1, v4, LSL #21
        MOV     a2, v4, LSR #11
        ORR     a2, a2, #LL_Page_HighAttr_XN            ; a1,a2 = first PT entry to match
        LDR     v7, [ip, #IOAllocPtr]
        MOV     v7, v7, LSR #18                         ; v7 = logical 2MB*8 that we're checking for a match
        LDR     v1, =LL2PT
        LDR     v2, [ip, #IOAllocTop]
        MOV     v2, v2, LSR #18                         ; v2 = last logical 2MB*8 to check (exclusive)
10
        ADD     ip, v7, v8, LSL #3                      ; logical 2MB*8 of base mapping or second mapping if there is one
        CMP     ip, v2
        BHS     %FT50                                   ; run out of logical addresses to check
        LDRD    a3, [v1, v7]                            ; check only or first entry
        TEQ     a1, a3
        TEQEQ   a2, a4
        LDREQD  a3, [v1, ip]                            ; check only or second entry
        TEQEQ   a1, a3
        TEQEQ   a2, a4
        ADD     v7, v7, #8                              ; next logical 2MB to check
        BNE     %BT10

        ; Found start of requested IO already mapped, and with required flags
        ; Now check that the remaining sections are all there too
        MOV     v6, v4                                  ; current 2MB being checked
        MOV     v3, v7, LSL #18
        SUB     v3, v3, #1:SHL:21                       ; start logical address
20
        ADD     v6, v6, #1                              ; next physical 2MB
        ADDS    a1, a1, #1:SHL:21                       ; next PTE
        ADC     a2, a2, #0
        ADD     ip, v7, v8, LSL #3
        CMP     v6, v5
        BHS     %FT80
        CMP     ip, v2
        BHS     %FT45                                   ; run out of logical addresses to check
        LDRD    a3, [v1, v7]                            ; check only or first entry
        TEQ     a1, a3
        TEQEQ   a2, a4
        LDREQD  a3, [v1, ip]                            ; check only or second entry
        TEQEQ   a1, a3
        TEQEQ   a2, a4
        ADDEQ   v7, v7, #8                              ; next logical 2MB*8
        BEQ     %BT20                                   ; good so far, try next entry
        ; Mismatch, rewind PTE and continue outer loop
        SUB     v6, v6, v4
        SUBS    a1, a1, v6, LSL #21
        SBC     a2, a2, v6, LSR #11
        B       %BT10

45
        ; Rewind PTE
        SUB     v6, v6, v4
        SUBS    a1, a1, v6, LSL #21
        SBC     a2, a2, v6, LSR #11
50      ; Request not currently mapped, only partially mapped, or mapped with wrong flags
        LDR     ip, =ZeroPage
        LDR     a3, [ip, #IOAllocPtr]
        MOV     a3, a3, LSR #21
        SUB     v7, v5, v4                              ; v7 = number of 2MB required
        SUB     a3, a3, v7
        MOV     v3, a3, LSL #21
        LDR     v6, [ip, #IOAllocLimit]
        CMP     v3, v6                                  ; run out of room to allocate IO?
        MOVLS   a1, #0                                  ; LS is to match previous version of the code - perhaps should be LO?
        BLS     %FT90
        STR     v3, [ip, #IOAllocPtr]
        MOV     a3, a3, LSL #3
60
        ADD     ip, a3, v8, LSL #3
        STRD    a1, [v1, a3]                            ; write only or first entry
        STRD    a1, [v1, ip]                            ; write only or second entry
        ADDS    a1, a1, #1:SHL:21
        ADC     a2, a2, #0
        ADD     a3, a3, #8
        SUBS    v7, v7, #1
        BNE     %BT60

        PageTableSync                                   ; corrupts a1
80
        LDR     a2, [sp]                                ; retrieve original physical address from stack
        BFI     v3, a2, #0, #21                         ; apply sub-2MB offset
        MOV     a1, v3
90
        EXIT
 |
        MOVS    v3, v2
        MOVNE   v3, #&F                                 ; v3 = number of MB to use in rounding (0 for sections, 15 for supersections)
        MOV     v4, a2, LSR #20
        ORR     v4, v4, a3, LSL #12
        BIC     v4, v4, v3                              ; v4 = physical start MB to map
        MOV     v5, v1, LSR #20
        ORR     v5, v5, v2, LSL #12
        ADD     v5, v5, #1                              ; make exclusive
        ADD     v5, v5, v3
        BIC     v5, v5, v3                              ; v5 = physical end MB to map
        ANDS    a2, a1, #MapInFlag_DoublyMapped
        SUBNE   a2, v5, v4                              ; a2 = offset of second mapping (in MB) or 0
        LDR     v6, =&FFFFF
        AND     a1, a1, v6                              ; mask out our extra flags
        CMP     v5, #&1000
        ORRHI   a1, a1, #L1_SS                          ; set supersection flag if necessary
        LDR     a3, [ip, #IOAllocPtr]
        MOV     a3, a3, LSR #20
        ADD     a3, a3, v3
        BIC     a3, a3, v3                              ; a3 = logical MB that we're checking for a match
        ConstructIOPTE a4, v4, a1, lr                   ; a4 = first PT entry to match
        ADD     v3, v3, #1                              ; v3 = number of MB to step between sections or supersections
        LDR     v1, =L1PT
        LDR     v2, [ip, #IOAllocTop]
        MOV     v2, v2, LSR #20                         ; v2 = last logical MB to check (exclusive)
        SUB     a3, a3, v3                              ; no increment on first iteration
10
        ADD     a3, a3, v3                              ; next logical MB to check
        ADD     ip, a3, a2                              ; logical MB of base mapping or second mapping if there is one
        CMP     ip, v2
        BHS     %FT50                                   ; run out of logical addresses to check
        LDR     lr, [v1, a3, LSL #2]                    ; check only or first entry
        TEQ     lr, a4
        LDREQ   lr, [v1, ip, LSL #2]                    ; check only or second entry
        TEQEQ   lr, a4
        BNE     %BT10

        ; Found start of requested IO already mapped, and with required flags
        ; Now check that the remaining sections are all there too
        MOV     v6, v4                                  ; v6 = expected physical MB
        MOV     v7, a3                                  ; v7 = logical MB we expect to find it at
20
        ADD     v6, v6, v3                              ; next physical MB
        ADD     v7, v7, v3                              ; next logical MB
        ADD     ip, v7, a2                              ; logical MB of base mapping or second mapping if there is one
        CMP     v6, v5
        MOVHS   a4, a3, LSL #20
        BHS     %FT80                                   ; reached end and everything matched
        CMP     ip, v2
        BHS     %FT50                                   ; run out of logical addresses to check
        ConstructIOPTE v8, v6, a1, lr
        LDR     lr, [v1, v7, LSL #2]                    ; check only or first entry
        TEQ     lr, v8
        LDREQ   lr, [v1, ip, LSL #2]                    ; check only or second entry
        TEQEQ   lr, v8
        BEQ     %BT20                                   ; good so far, try next entry
        B       %BT10                                   ; mismatch, continue outer loop

50      ; Request not currently mapped, only partially mapped, or mapped with wrong flags
        LDR     ip, =ZeroPage
        SUB     v8, v3, #1                              ; v8 = number of MB to use in rounding (0 for sections, 15 for supersections)
        LDR     a3, [ip, #IOAllocPtr]
        MOV     a3, a3, LSR #20
        BIC     a3, a3, v8                              ; round down to 1MB or 16MB boundary (some memory may remain unmapped above when we map in a supersection)
        SUB     a4, v5, v4
        ADD     a4, a4, a2                              ; a4 = number of MB required
        SUB     a3, a3, a4
        MOV     a4, a3, LSL #20
        LDR     v6, [ip, #IOAllocLimit]
        CMP     a4, v6                                  ; run out of room to allocate IO?
        MOVLS   a1, #0                                  ; LS is to match previous version of the code - perhaps should be LO?
        BLS     %FT90
        STR     a4, [ip, #IOAllocPtr]
60
        ConstructIOPTE v8, v4, a1, lr                   ; v8 = page table entry for this (super)section
        MOV     v7, v3                                  ; number of consecutive entries to program the same
70
        ADD     v6, a3, a2
        STR     v8, [v1, a3, LSL #2]                    ; write only or first entry
        ADD     a3, a3, #1
        STR     v8, [v1, v6, LSL #2]                    ; write only or second entry
        SUBS    v7, v7, #1
        BNE     %BT70
        ADD     v4, v4, v3
        CMP     v4, v5
        BLO     %BT60

        MOV     a2, a1
        PageTableSync                                   ; corrupts a1
        MOV     a1, a2
80
        LDR     a2, [sp]                                ; retrieve original physical address from stack
        BIC     a2, a2, #&FF000000                      ; distance from 16MB boundary for supersections
        TST     a1, #L1_SS
        BICEQ   a2, a2, #&00F00000                      ; distance from 1MB boundary for sections
        ADD     a1, a4, a2
90
        EXIT
 ]
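
; Both variants above implement the same policy: first scan the logical
; window between IOAllocPtr and IOAllocTop for an existing run of page table
; entries that already maps the requested physical range with the requested
; flags. Failing that, move IOAllocPtr down towards IOAllocLimit (returning
; 0 if it would pass it), write the new entries, sync the page tables, and
; return the logical address with the caller's sub-section offset re-applied.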


; void RISCOS_AddDevice(unsigned int flags, struct device *d)
RISCOS_AddDevice
        ADDS    a1, a2, #0      ; also clears V
        B       HardwareDeviceAdd_Common

; uint64_t RISCOS_LogToPhys(const void *log)
RISCOS_LogToPhys ROUT
        Push    "r4,r5,r8,r9,lr"
        MOV     r4, a1
        BL      logical_to_physical
        MOVCC   a1, r8
        MOVCC   a2, r9
        BCC     %FT10
        ; Try checking L1PT for any section mappings (logical_to_physical only
        ; deals with regular 4K page mappings)
        ; TODO - Add large page support
        MOV     r0, a1, LSR #20
        MOV     r0, r0, LSL #20
        BL      LoadAndDecodeL1Entry
        CMP     r2, #-2
        MOVHS   a1, #-1                 ; No L1 page
        MOVHS   a2, #-1
        SUBLO   r3, r3, #1              ; Valid L1 page, apply sub-page offset
        ANDLO   r4, r4, r3
        ADDLO   a1, r0, r4
10
        Pull    "r4,r5,r8,r9,pc"
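
; Illustrative C for the above; the helper signatures paraphrase the register
; interfaces of logical_to_physical and LoadAndDecodeL1Entry and are
; assumptions, not a real API:
;
;   uint64_t RISCOS_LogToPhys(const void *log)
;   {
;       uint64_t phys;
;       if (logical_to_physical(log, &phys))        /* 4K page mappings */
;           return phys;
;       uint64_t base; uint32_t size;               /* section mappings */
;       if (!LoadAndDecodeL1Entry((uint32_t)log & ~0xFFFFF, &base, &size))
;           return ~(uint64_t)0;                    /* unmapped => -1 */
;       return base + ((uint32_t)log & (size - 1));
;   }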

; int RISCOS_IICOpV(IICDesc *descs, uint32_t ndesc_and_bus)
RISCOS_IICOpV ROUT
        Push    "lr"
        BL      IIC_OpV
        MOVVC   a1, #IICStatus_Completed
        Pull    "pc", VC
        ; Map from RISC OS error numbers to abstract IICStatus return values
        LDR     a1, [a1]
        LDR     lr, =ErrorNumber_IIC_NoAcknowledge
        SUB     a1, a1, lr              ; 0/1/2 = NoAck/Error/Busy
        CMP     a1, #3
        MOVCS   a1, #3                  ; 3+ => unknown, either way it's an Error
        ADR     lr, %FT10
        LDRB    a1, [lr, a1]
        Pull    "pc"
10
        ASSERT    (ErrorNumber_IIC_Error - ErrorNumber_IIC_NoAcknowledge) = 1
        ASSERT    (ErrorNumber_IIC_Busy - ErrorNumber_IIC_NoAcknowledge) = 2
        DCB       IICStatus_NoACK, IICStatus_Error, IICStatus_Busy, IICStatus_Error
        ALIGN
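
; i.e. roughly, in C (map[] stands for the DCB table above, indexed per the
; ASSERTs):
;
;   unsigned idx = errnum - ErrorNumber_IIC_NoAcknowledge;
;   if (idx >= 3) idx = 3;              /* unknown errors count as Error */
;   return map[idx];                    /* NoACK / Error / Busy / Error */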

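; Build the OS's copy of the HAL entry table. The HAL's own table holds
; offsets from the table base (0 = entry not implemented); each is converted
; to an absolute function pointer and stored in descending order immediately
; below HALWorkspace, padding with NullHALEntry if the HAL supplied fewer
; entries than expected.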
SetUpHALEntryTable ROUT
        LDR     a1, =ZeroPage
        LDR     a2, [a1, #HAL_Descriptor]
        LDR     a3, [a1, #HAL_Workspace]
        LDR     a4, [a2, #HALDesc_Entries]
        LDR     ip, [a2, #HALDesc_NumEntries]
        ADD     a4, a2, a4                              ; a4 -> entry table
        MOV     a2, a4                                  ; a2 -> entry table (increments)
10      SUBS    ip, ip, #1                              ; decrement counter
        LDRCS   a1, [a2], #4
        BCC     %FT20
        TEQ     a1, #0
        ADREQ   a1, NullHALEntry
        ADDNE   a1, a4, a1                              ; convert offset to absolute
        STR     a1, [a3, #-4]!                          ; store backwards below HAL workspace
        B       %BT10
20      LDR     a4, =HALWorkspace                       ; pad table with NullHALEntries
        ADR     a1, NullHALEntry                        ; in case where HAL didn't supply enough
30      CMP     a3, a4
        STRHI   a1, [a3, #-4]!
        BHI     %BT30
        MOV     pc, lr


NullHALEntry
        MOV     pc, lr

; Can freely corrupt r10-r12 (v7,v8,ip).
HardwareSWI
        AND     ip, v5, #&FF

        CMP     ip, #OSHW_LookupRoutine
        ASSERT  OSHW_CallHAL < OSHW_LookupRoutine
        BLO     HardwareCallHAL
        BEQ     HardwareLookupRoutine

        CMP     ip, #OSHW_DeviceRemove
        ASSERT  OSHW_DeviceAdd < OSHW_DeviceRemove
        BLO     HardwareDeviceAdd
        BEQ     HardwareDeviceRemove

        CMP     ip, #OSHW_DeviceEnumerateChrono
        ASSERT  OSHW_DeviceEnumerate < OSHW_DeviceEnumerateChrono
        ASSERT  OSHW_DeviceEnumerateChrono < OSHW_MaxSubreason
        BLO     HardwareDeviceEnumerate
        BEQ     HardwareDeviceEnumerateChrono
        BHI     HardwareBadReason
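
; Assuming the OSHW_* subreasons are consecutively numbered (the ASSERTs
; above check the ordering), the dispatch above is equivalent to:
;
;   switch (reason & 0xFF) {
;   case OSHW_CallHAL:               goto HardwareCallHAL;
;   case OSHW_LookupRoutine:         goto HardwareLookupRoutine;
;   case OSHW_DeviceAdd:             goto HardwareDeviceAdd;
;   case OSHW_DeviceRemove:          goto HardwareDeviceRemove;
;   case OSHW_DeviceEnumerate:       goto HardwareDeviceEnumerate;
;   case OSHW_DeviceEnumerateChrono: goto HardwareDeviceEnumerateChrono;
;   default:                         goto HardwareBadReason;
;   }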

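; HardwareCallHAL and HardwareLookupRoutine both fetch entries from the table
; built by SetUpHALEntryTable, which sits immediately below the HAL workspace
; pointer (sb), stored backwards; in C terms:
;
;   fn = ((void **)sb)[-(entry + 1)];
;   /* entry >= NumEntries or fn == NullHALEntry => bad entry error */
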
HardwareCallHAL
        Push    "v1-v4,sb,lr"
        ADD     v8, sb, #1                              ; v8 = entry no + 1
        LDR     ip, =ZeroPage
        LDR     v7, [ip, #HAL_Descriptor]
        AddressHAL ip                                   ; sb set up
        LDR     v7, [v7, #HALDesc_NumEntries]           ; v7 = number of entries
        CMP     v8, v7                                  ; entryno + 1 must be <= number of entries
        BHI     HardwareBadEntry2
        LDR     ip, [sb, -v8, LSL #2]
        ADR     v7, NullHALEntry
        TEQ     ip, v7
        BEQ     HardwareBadEntry2
      [ NoARMv5
        MOV     lr, pc
        MOV     pc, ip
      |
        BLX     ip
      ]
        ADD     sp, sp, #4*4
        Pull    "sb,lr"
        ExitSWIHandler

HardwareLookupRoutine
        ADD     v8, sb, #1                              ; v8 = entry no + 1
        LDR     ip, =ZeroPage
        LDR     v7, [ip, #HAL_Descriptor]
        AddressHAL ip
        LDR     v7, [v7, #HALDesc_NumEntries]
        CMP     v8, v7                                  ; entryno + 1 must be <= number of entries
        BHI     HardwareBadEntry
        LDR     a1, [sb, -v8, LSL #2]
        ADR     v7, NullHALEntry
        TEQ     a1, v7
        BEQ     HardwareBadEntry
        MOV     a2, sb
        ExitSWIHandler

HardwareDeviceAdd
        Push    "r1-r3,lr"
        BL      HardwareDeviceAdd_Common
        Pull    "r1-r3,lr"
        B       SLVK_TestV

HardwareDeviceRemove
        Push    "r1-r3,lr"
        BL      HardwareDeviceRemove_Common
        Pull    "r1-r3,lr"
        B       SLVK_TestV

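; The device table is a system heap block holding DeviceCount words, most
; recently added device first. A rough C sketch of the insertion below
; (helper names are illustrative):
;
;   HardwareDeviceRemove_Common(d);            /* no duplicate entries */
;   if (table == NULL) table = claim_sys_heap(16);
;   else if (table_full()) table = extend_sys_heap(table, 16);
;   memmove(&table[1], &table[0], old_count * sizeof(void *));
;   table[0] = d; DeviceCount = old_count + 1;
;   issue_service(Service_Hardware, 0, d);     /* reason 0: device added */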
HardwareDeviceAdd_Common
        Entry
        BL      HardwareDeviceRemove_Common             ; first try to remove any device already at the same address
        EXIT    VS
        LDR     lr, =ZeroPage
        LDR     r1, [lr, #DeviceCount]
        LDR     r2, [lr, #DeviceTable]
        TEQ     r2, #0
        BEQ     %FT80
        ADD     r1, r1, #1                              ; increment DeviceCount
        LDR     lr, [r2, #-4]                           ; word before heap block is length including length word
        TEQ     r1, lr, LSR #2                          ; block already full?
        BEQ     %FT81
        LDR     lr, =ZeroPage
10      STR     r1, [lr, #DeviceCount]
        ADD     lr, r2, r1, LSL #2
        SUB     lr, lr, #4
11      LDR     r1, [lr, #-4]!                          ; copy existing devices up, so new ones get enumerated first
        STR     r1, [lr, #4]
        CMP     lr, r2
        BHI     %BT11
        STR     r0, [r2]
        MOV     r2, r0
        MOV     r1, #Service_Hardware
        MOV     r0, #0
        BL      Issue_Service
        ADDS    r0, r2, #0                              ; exit with V clear
        EXIT

80      ; Claim a system heap block for the device table
        Push    "r0"
        MOV     r3, #16
        BL      ClaimSysHeapNode
        ADDVS   sp, sp, #4
        EXIT    VS
        Pull    "r0"
        LDR     lr, =ZeroPage
        MOV     r1, #1
        STR     r2, [lr, #DeviceTable]
        B       %BT10

81      ; Extend the system heap block
        Push    "r0"
        MOV     r0, #HeapReason_ExtendBlock
        MOV     r3, #16
        BL      DoSysHeapOpWithExtension
        ADDVS   sp, sp, #4
        EXIT    VS
        Pull    "r0"
        LDR     lr, =ZeroPage
        LDR     r1, [lr, #DeviceCount]
        STR     r2, [lr, #DeviceTable]
        ADD     r1, r1, #1
        B       %BT10

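; Removal gives Service_Hardware handlers a chance to veto (by claiming the
; call), then searches again in case a handler re-entered us, and finally
; closes the gap. Roughly, with illustrative helper names:
;
;   if ((i = find(d)) < 0) return;             /* not registered */
;   if (issue_service(Service_Hardware, 1, d) == CLAIMED)
;       return error;                          /* reason 1: device removal */
;   if ((i = find(d)) < 0) return;             /* table may have changed */
;   memmove(&table[i], &table[i+1], (DeviceCount-i-1) * sizeof(void *));
;   DeviceCount--;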
HardwareDeviceRemove_Common ROUT
        Entry   "r4"
        LDR     lr, =ZeroPage
        LDR     r3, [lr, #DeviceCount]
        LDR     r4, [lr, #DeviceTable]
        TEQ     r3, #0
        EXIT    EQ                                      ; no devices registered
10      LDR     r2, [r4], #4
        SUBS    r3, r3, #1
        TEQNE   r2, r0
        BNE     %BT10
        TEQ     r2, r0
        EXIT    NE                                      ; this device not registered
        MOV     r0, #1
        MOV     r1, #Service_Hardware
        BL      Issue_Service
        CMP     r1, #0                                  ; if service call claimed
        CMPEQ   r1, #1:SHL:31                           ; then set V (r0 already points to error block)
        EXIT    VS                                      ; and exit
        ; Search for device again - we may have been re-entered
        MOV     r0, r2
        LDR     lr, =ZeroPage
        LDR     r3, [lr, #DeviceCount]
        LDR     r4, [lr, #DeviceTable]
        TEQ     r3, #0
        EXIT    EQ                                      ; no devices registered
20      LDR     r2, [r4], #4
        SUBS    r3, r3, #1
        TEQNE   r2, r0
        BNE     %BT20
        TEQ     r2, r0
        EXIT    NE                                      ; this device not registered
        SUBS    r3, r3, #1
30      LDRCS   r2, [r4], #4                            ; copy down remaining devices
        STRCS   r2, [r4, #-8]
        SUBCSS  r3, r3, #1
        BCS     %BT30
        LDR     lr, =ZeroPage
        LDR     r3, [lr, #DeviceCount]
        SUB     r3, r3, #1
        STR     r3, [lr, #DeviceCount]
        EXIT

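; Enumerate devices of a given type: r0 holds the type in its low half-word
; and the highest device version the caller understands in its top half-word;
; r1 is the index to resume from. Returns r2 -> device and r1 = next index,
; or r1 = -1 when there are no more matches. Roughly:
;
;   for (i = r1; i < DeviceCount; i++)
;       if ((table[i]->type & 0xFFFF) == (r0 & 0xFFFF)
;           && (table[i]->version >> 16) <= (r0 >> 16))
;           return (r2 = table[i], r1 = i + 1);
;   return (r1 = -1);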
HardwareDeviceEnumerate
        Push    "r3-r4,lr"
        LDR     lr, =ZeroPage
        LDR     r2, [lr, #DeviceCount]
        LDR     r3, [lr, #DeviceTable]
        SUBS    r4, r2, r1
        MOVLS   r1, #-1
        BLS     %FT90                                   ; if r1 is out of range then exit
        ADD     r3, r3, r1, LSL #2
10      ADD     r1, r1, #1
        LDR     r2, [r3], #4
        LDR     lr, [r2, #HALDevice_Type]
        EOR     lr, lr, r0
        MOVS    lr, lr, LSL #16                         ; EQ if types match
        SUBNES  r4, r4, #1
        BNE     %BT10
        TEQ     lr, #0
        MOVNE   r1, #-1
        BNE     %FT90
        LDR     lr, [r2, #HALDevice_Version]
        MOV     lr, lr, LSR #16
        CMP     lr, r0, LSR #16                         ; newer than our client understands?
        BLS     %FT90
        SUBS    r4, r4, #1
        BHI     %BT10
        MOV     r1, #-1
90
        Pull    "r3-r4,lr"
        ExitSWIHandler

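; As HardwareDeviceEnumerate, but walks the table backwards, i.e. in
; chronological order of registration (oldest device first).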
HardwareDeviceEnumerateChrono
        Push    "r3-r4,lr"
        LDR     lr, =ZeroPage
        LDR     r2, [lr, #DeviceCount]
        LDR     r3, [lr, #DeviceTable]
        SUBS    r4, r2, r1
        MOVLS   r1, #-1
        BLS     %FT90                                   ; if r1 is out of range then exit
        ADD     r3, r3, r4, LSL #2
10      ADD     r1, r1, #1
        LDR     r2, [r3, #-4]!
        LDR     lr, [r2, #HALDevice_Type]
        EOR     lr, lr, r0
        MOVS    lr, lr, LSL #16                         ; EQ if types match
        SUBNES  r4, r4, #1
        BNE     %BT10
        TEQ     lr, #0
        MOVNE   r1, #-1
        BNE     %FT90
        LDR     lr, [r2, #HALDevice_Version]
        MOV     lr, lr, LSR #16
        CMP     lr, r0, LSR #16                         ; newer than our client understands?
        BLS     %FT90
        SUBS    r4, r4, #1
        BHI     %BT10
        MOV     r1, #-1
90
        Pull    "r3-r4,lr"
        ExitSWIHandler

HardwareBadReason
        ADR     r0, ErrorBlock_HardwareBadReason
 [ International
        Push    "lr"
        BL      TranslateError
        Pull    "lr"
 ]
        B       SLVK_SetV

HardwareBadEntry2
        ADD     sp, sp, #4*4
        Pull    "sb,lr"
HardwareBadEntry
        ADR     r0, ErrorBlock_HardwareBadEntry
 [ International
        Push    "lr"
        BL      TranslateError
        Pull    "lr"
 ]
        B       SLVK_SetV

        MakeErrorBlock HardwareBadReason
        MakeErrorBlock HardwareBadEntry

 [ DebugTerminal
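; Read a character from the HAL debug UART. ESC sets the OS escape flag when
; RS423 is emulating the keyboard; while no character is pending, requested
; transient callbacks are processed.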
DebugTerminal_Rdch
        Push    "a2-a4,sb,ip"
        MOV     sb, ip
20
        WritePSRc SVC_mode, r1
        CallHAL HAL_DebugRX
        CMP     a1, #27
        BNE     %FT25
        LDR     a2, =ZeroPage + OsbyteVars + :INDEX: RS423mode
        LDRB    a2, [a2]
        TEQ     a2, #0                  ; is RS423 raw data, or keyboard emulator?
        BNE     %FT25
        LDR     a2, =ZeroPage
        LDRB    a1, [a2, #ESC_Status]
        ORR     a1, a1, #&40
        STRB    a1, [a2, #ESC_Status]   ; mark escape flag
        MOV     a1, #27
        SEC                             ; tell caller to look carefully at R0
        Pull    "a2-a4,sb,ip,pc"
25
        CMP     a1, #-1
        Pull    "a2-a4,sb,ip,pc",NE     ; claim it
        LDR     R0, =ZeroPage
        LDRB    R14, [R0, #CallBack_Flag]
        TST     R14, #CBack_VectorReq
        BLNE    process_callbacks_disableIRQ
        B       %BT20


DebugTerminal_Wrch
        Push    "a1-a4,sb,ip,lr"
        MOV     sb, ip
        CallHAL HAL_DebugTX
        Pull    "a1-a4,sb,ip,pc"        ; don't claim it
 ]


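; Minimal IRQ handler used during reset: if the active interrupt belongs to
; one of the background-mode IIC buses it is serviced via IICIRQ; any other
; source is disabled so a rogue device can't hold the CPU off.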
Reset_IRQ_Handler
        SUB     lr, lr, #4
        Push    "a1-a4,v1-v2,sb,ip,lr"
        MRS     a1, SPSR
        MRS     a2, CPSR
        ORR     a3, a2, #SVC32_mode
        MSR     CPSR_c, a3
        Push    "a1-a2,lr"

        ; If it's not an IIC interrupt, mute it
        LDR     v2, =ZeroPage
        AddressHAL v2
        CallHAL HAL_IRQSource
        ADD     v1, v2, #IICBus_Base
        MOV     ip, #0
10
        LDR     a2, [v1, #IICBus_Type]
        TST     a2, #IICFlag_Background
        BEQ     %FT20
        LDR     a2, [v1, #IICBus_Device]
        CMP     a2, a1
        ADREQ   lr, Reset_IRQ_Exit
        BEQ     IICIRQ
20
        ADD     ip, ip, #1
        ADD     v1, v1, #IICBus_Size
        CMP     ip, #IICBus_Count
        BNE     %BT10

        CallHAL HAL_IRQDisable ; Stop the rogue device from killing us completely

Reset_IRQ_Exit
        MyCLREX a1, a2
        Pull    "a1-a2,lr"
        MSR     CPSR_c, a2
        MSR     SPSR_cxsf, a1
        Pull    "a1-a4,v1-v2,sb,ip,pc",,^

 [ DebugHALTX
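; Print the NUL-terminated string that immediately follows the call, e.g.
;
;       BL      DebugHALPrint
;       DCB     "message", 0
;       ALIGN
;
; Control returns to the next word-aligned address after the terminator.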
DebugHALPrint
        Push    "a1-a4,v1,v2,sb,ip"
        MRS     v2, CPSR
        AddressHAL
        MOV     v1, lr
10      LDRB    a1, [v1], #1
        TEQ     a1, #0
        BEQ     %FT20
        CallHAL HAL_DebugTX
        B       %BT10
20      MOV     a1, #13
;        CallHAL HAL_DebugTX
        MOV     a1, #10
;        CallHAL HAL_DebugTX
        ADD     v1, v1, #3
        BIC     lr, v1, #3
        MSR     CPSR_sf, v2
        Pull    "a1-a4,v1,v2,sb,ip"
        MOV     pc, lr
 ]


 [ DebugHALTX
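; Usage (the pushed word is removed from the stack before returning):
;
;       Push    "r0"                    ; value to print as 8 hex digits + CRLF
;       BL      DebugHALPrintReg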
DebugHALPrintReg ; Output number on top of stack to the serial port
        Push    "a1-a4,v1-v4,sb,ip,lr"   ; this is 11 regs
        MRS     v1, CPSR
        LDR     v2, [sp,#11*4]           ; find TOS value on stack
        ADR     v3, hextab
        MOV     v4, #8
05      AddressHAL
10      LDRB    a1, [v3, v2, LSR #28]   ; hextab indexed by top nibble
        CallHAL HAL_DebugTX
        MOV     v2, v2, LSL #4          ; next nibble
        SUBS    v4, v4, #1
        BNE     %BT10
        MOV     a1, #13
        CallHAL HAL_DebugTX
        MOV     a1, #10
        CallHAL HAL_DebugTX

        MSR     CPSR_sf, v1
        Pull    "a1-a4,v1-v4,sb,ip,lr"
        ADD     sp, sp, #4
        MOV     pc, lr

hextab  DCB "0123456789abcdef"


 ]
;
;
; [ DebugHALTX
;HALDebugHexTX
;       stmfd    r13!, {r0-r3,sb,ip,lr}
;       AddressHAL
;       b        jbdt1
;HALDebugHexTX2
;       stmfd    r13!, {r0-r3,sb,ip,lr}
;       AddressHAL
;       mov      r0,r0,lsl #16
;       b        jbdt2
;HALDebugHexTX4
;       stmfd    r13!, {r0-r3,sb,ip,lr}
;       AddressHAL
;       mov      r0,r0,ror #24          ; hi byte
;       bl       jbdtxh
;       mov      r0,r0,ror #24
;       bl       jbdtxh
;jbdt2
;       mov      r0,r0,ror #24
;       bl       jbdtxh
;       mov      r0,r0,ror #24
;jbdt1
;       bl       jbdtxh
;       mov      r0,#' '
;       CallHAL  HAL_DebugTX
;       ldmfd    r13!, {r0-r3,sb,ip,pc}
;
;jbdtxh stmfd    r13!,{a1,v1,lr}        ; print byte as hex. corrupts a2-a4, ip, assumes sb already AddressHAL'd
;       and      v1,a1,#&f              ; get low nibble
;       and      a1,a1,#&f0             ; get hi nibble
;       mov      a1,a1,lsr #4           ; shift to low nibble
;       cmp      a1,#&9                 ; 9?
;       addle    a1,a1,#&30
;       addgt    a1,a1,#&37             ; convert letter if needed
;       CallHAL  HAL_DebugTX
;       cmp      v1,#9
;       addle    a1,v1,#&30
;       addgt    a1,v1,#&37
;       CallHAL  HAL_DebugTX
;       ldmfd    r13!,{a1,v1,pc}
; ]
;
        END