From 5e11e66520ac367224221fb84cd899538accce76 Mon Sep 17 00:00:00 2001
From: Jeffrey Lee <jlee@gitlab.riscosopen.org>
Date: Sun, 15 Apr 2012 19:48:09 +0000
Subject: [PATCH] OS_ChangeDynamicArea performance optimisations

Detail:
  s/ChangeDyn:
    - Apply various optimisations to OS_ChangeDynamicArea to reduce the execution time when performing large grows/shrinks.
    - Optimisations can be toggled on/off with FastCDA_* flags for debugging.
    - On a 1GHz 512MB BB-xM, the initial *FreePool call now takes 0.15s instead of 13.46s. On a 512MB Iyonix the time has dropped from 1.18s to 0.23s.
    - Growing screen memory (on BB-xM) has also seen significant gains - between 2x and 4x speedup, depending on what state the source pages are in.
    - Added/updated documentation for a few functions and made more use of ROUTs for safety
  s/ARM600, s/VMSAv6:
    - Update BangCamUpdate, etc. to add support for the PageFlags_Unsafe flag that OS_ChangeDynamicArea uses to bypass cache/TLB maintenance in some situations
    - Avoid BangCamUpdate calling BangL2PT to map out the page if the page isn't mapped in (avoids unnecessary cache/TLB flush)
  s/ArthurSWIs:
    - Add extra ASSERT for safety
  s/AMBControl/Memory:
    - Fix incorrect assumption that the usable size of a heap block is always 8 bytes less than the value stored in the header. Even with the old 8 byte aligned allocations the usable size will always be 4 bytes less than the value in the header. This code would have resulted in some slight memory wastage, as AMBControl would always have tried to grow the block four bytes bigger than needed.
Admin:
  Tested on Iyonix & BB-xM


Version 5.35, 4.79.2.146. Tagged as 'Kernel-5_35-4_79_2_146'
---
 VersionASM          |  10 +-
 VersionNum          |  14 +-
 s/AMBControl/Memory |   2 +-
 s/ARM600            |  12 +-
 s/ArthurSWIs        |   1 +
 s/ChangeDyn         | 402 +++++++++++++++++++++++++++++++++++++++++---
 s/VMSAv6            |  12 +-
 7 files changed, 417 insertions(+), 36 deletions(-)

diff --git a/VersionASM b/VersionASM
index b0fca0c..1537ffb 100644
--- a/VersionASM
+++ b/VersionASM
@@ -13,11 +13,11 @@
                         GBLS    Module_ComponentPath
 Module_MajorVersion     SETS    "5.35"
 Module_Version          SETA    535
-Module_MinorVersion     SETS    "4.79.2.145"
-Module_Date             SETS    "08 Apr 2012"
-Module_ApplicationDate  SETS    "08-Apr-12"
+Module_MinorVersion     SETS    "4.79.2.146"
+Module_Date             SETS    "15 Apr 2012"
+Module_ApplicationDate  SETS    "15-Apr-12"
 Module_ComponentName    SETS    "Kernel"
 Module_ComponentPath    SETS    "castle/RiscOS/Sources/Kernel"
-Module_FullVersion      SETS    "5.35 (4.79.2.145)"
-Module_HelpVersion      SETS    "5.35 (08 Apr 2012) 4.79.2.145"
+Module_FullVersion      SETS    "5.35 (4.79.2.146)"
+Module_HelpVersion      SETS    "5.35 (15 Apr 2012) 4.79.2.146"
                         END
diff --git a/VersionNum b/VersionNum
index ac8e2be..5c76c40 100644
--- a/VersionNum
+++ b/VersionNum
@@ -5,19 +5,19 @@
  *
  */
 #define Module_MajorVersion_CMHG        5.35
-#define Module_MinorVersion_CMHG        4.79.2.145
-#define Module_Date_CMHG                08 Apr 2012
+#define Module_MinorVersion_CMHG        4.79.2.146
+#define Module_Date_CMHG                15 Apr 2012
 
 #define Module_MajorVersion             "5.35"
 #define Module_Version                  535
-#define Module_MinorVersion             "4.79.2.145"
-#define Module_Date                     "08 Apr 2012"
+#define Module_MinorVersion             "4.79.2.146"
+#define Module_Date                     "15 Apr 2012"
 
-#define Module_ApplicationDate          "08-Apr-12"
+#define Module_ApplicationDate          "15-Apr-12"
 
 #define Module_ComponentName            "Kernel"
 #define Module_ComponentPath            "castle/RiscOS/Sources/Kernel"
 
-#define Module_FullVersion              "5.35 (4.79.2.145)"
-#define Module_HelpVersion              "5.35 (08 Apr 2012) 4.79.2.145"
+#define Module_FullVersion              "5.35 (4.79.2.146)"
+#define Module_HelpVersion              "5.35 (15 Apr 2012) 4.79.2.146"
 #define Module_LibraryVersionInfo       "5:35"
diff --git a/s/AMBControl/Memory b/s/AMBControl/Memory
index 8fc2f47..4932ec3 100644
--- a/s/AMBControl/Memory
+++ b/s/AMBControl/Memory
@@ -50,7 +50,7 @@ AMB_BlockResize ROUT
         ADD     r3,r3,#AMBblockQ - 1
         BIC     r3,r3,#AMBblockQ - 1
         LDR     r1,[r2,#-4]                  ;pick up OS_Heap's size word (naughty!)
-        SUB     r1,r1,#8                     ;heap size will be 8 more than quantised size
+        SUB     r1,r1,#4                     ;heap size will be (at least) 4 more than quantised size
         SUBS    r3,r3,r1                     ;required size change
         MOVNE   r0, #HeapReason_ExtendBlock
         BLNE    DoSysHeapOpWithExtension
diff --git a/s/ARM600 b/s/ARM600
index d14728d..fd55935 100644
--- a/s/ARM600
+++ b/s/ARM600
@@ -218,7 +218,8 @@ BangCamUpdate ROUT
         LDR     r1, [r1, #CamEntriesPointer]
         ADD     r1, r1, r2, LSL #3              ; point at cam entry (logaddr, PPL)
         LDMIA   r1, {r0, r6}                    ; r0 = current logaddress, r6 = current PPL
-        STMIA   r1, {r3, r11}                   ; store new address, PPL
+        BIC     r4, r11, #PageFlags_Unsafe
+        STMIA   r1, {r3, r4}                    ; store new address, PPL
         Push    "r0, r6"                        ; save old logical address, PPL
         LDR     r1, =ZeroPage+PhysRamTable      ; go through phys RAM table
         MOV     r6, r2                          ; make copy of r2 (since that must be preserved)
@@ -248,11 +249,15 @@ BangCamUpdate ROUT
         TEQ     r4, r0, LSR #12                 ; if equal to physical address of page being moved
         BNE     %FT20                           ; if not there, then just put in new page
 
+        AND     r4, r11, #PageFlags_Unsafe
         Push    "r0, r3, r11, r14"              ; save phys.addr, new log.addr, new PPL, lr
         ADD     r3, sp, #4*4
         LDMIA   r3, {r3, r11}                   ; reload old logical address, old PPL
+        LDR     r0, =DuffEntry                  ; Nothing to do if wasn't mapped in
+        ORR     r11, r11, r4
+        TEQ     r3, r0
         MOV     r0, #0                          ; cause translation fault
-        BL      BangL2PT                        ; map page out
+        BLNE    BangL2PT                        ; map page out
         Pull    "r0, r3, r11, r14"
 20
         ADD     sp, sp, #8                      ; junk old logical address, PPL
@@ -340,6 +345,9 @@ BangL2PT                                        ; internal entry point used only
         Push    "lr"
         MOV     r6, r0
 
+        TST     r11, #PageFlags_Unsafe
+        BNE     %FT30
+
         TST     r11, #DynAreaFlags_DoublyMapped
         BNE     BangL2PT_sledgehammer           ;if doubly mapped, don't try to be clever
 
diff --git a/s/ArthurSWIs b/s/ArthurSWIs
index 93172a7..9db8f57 100644
--- a/s/ArthurSWIs
+++ b/s/ArthurSWIs
@@ -1083,6 +1083,7 @@ fakeservicecall
        MOV      r0, r0
        LDR      r10, =ZeroPage
        LDRB     r9, [r10, #FIQclaim_interlock]
+       ASSERT   (ZeroPage :AND: 255) = 0
        STRB     r10, [r10, #FIQclaim_interlock]
 
    [ FIQDebug
diff --git a/s/ChangeDyn b/s/ChangeDyn
index 3b8a382..4023df2 100644
--- a/s/ChangeDyn
+++ b/s/ChangeDyn
@@ -14,6 +14,89 @@
 ;
         TTL   => ChangeDyn
 
+        ; OS_ChangeDynamicArea optimisations (assembly-time switches; set one to {FALSE} to fall back to the old behaviour when debugging):
+
+        GBLL  FastCDA_UpFront ; Do all cache/TLB maintenance upfront instead of on a per-page basis (pages are then moved with PageFlags_Unsafe set)
+FastCDA_UpFront SETL {TRUE}
+
+        GBLL  FastCDA_FIQs ; Don't thrash ClaimFIQ/ReleaseFIQ in DoTheGrowPagesSpecified (claim once before the page-moving loop, release once after it)
+FastCDA_FIQs SETL {TRUE}
+
+        GBLL  FastCDA_NoPhysical ; Don't use RISCOS_AccessPhysicalAddress/RISCOS_ReleasePhysicalAddress in DoTheGrowPagesSpecified. Instead, map pages straight to destination address.
+FastCDA_NoPhysical SETL {TRUE}
+
+        GBLL  FastCDA_CorruptFreePool ; Contents of free pool pages don't need preserving in DoTheGrowPagesSpecified
+FastCDA_CorruptFreePool SETL {TRUE}
+
+        GBLL  FastCDA_Unnecessary ; Avoid unnecessary MMU_ChangingEntry calls in DoTheGrowPagesSpecified (skipped when the page is marked DynAreaFlags_NotCacheable)
+FastCDA_Unnecessary SETL {TRUE}
+
+        ; DoTheGrowPagesSpecified profiling code
+        ; Written to use Cortex-A8 cycle count performance counter - will need modifying for other CPUs!
+
+        GBLL  FastCDA_Prof ; Assembly-time switch for the profiling code; off by default
+FastCDA_Prof SETL {FALSE}
+
+      [ FastCDA_Prof
+        ; Squeeze profiling workspace into "free space after envstring": one word per profiled section, addressed as ZeroPage+FastCDA_Prof_<section>
+                                 ^ ExtendedROMFooter+4
+        ! 0, "FastCDA_Prof workspace at ":CC::STR:@
+FastCDA_Prof_DoTheGrowInit           # 4
+FastCDA_Prof_MarkRequired            # 4
+FastCDA_Prof_PagesUnsafe             # 4
+FastCDA_Prof_DoublyCheckCacheability # 4
+FastCDA_Prof_DoublyMovePages         # 4
+FastCDA_Prof_FindSpare               # 4
+FastCDA_Prof_ClaimFIQ                # 4
+FastCDA_Prof_AccessPhysical          # 4
+FastCDA_Prof_CopyPage                # 4
+FastCDA_Prof_ReleasePhysical         # 4
+FastCDA_Prof_MoveReplacement         # 4
+FastCDA_Prof_MoveNeeded              # 4
+FastCDA_Prof_ReleaseFIQ              # 4
+FastCDA_Prof_PagesSafe               # 4
+FastCDA_Prof_CallPreGrow             # 4
+FastCDA_Prof_CallPostGrow            # 4
+FastCDA_Prof_MMUChanging             # 4
+FastCDA_Prof_MMUChangingUncached     # 4
+FastCDA_Prof_ChangingEntry           # 4
+        ASSERT @ <= &500 ; workspace must not run past &500 (presumably the end of the free space - confirm against the zero page layout)
+      ]
+
+        MACRO
+        FastCDA_ProfInit $temp
+      [ FastCDA_Prof
+        MVN     $temp,#0                        ; all bits set
+        MCR     p15,0,$temp,c9,c12,2            ; ARMv7 PMU count enable clear register: disable every counter
+        MOV     $temp,#1<<31                    ; bit 31 selects the cycle counter
+        MCR     p15,0,$temp,c9,c12,1            ; ARMv7 PMU count enable set register: enable the cycle counter only
+        MOV     $temp,#7                        ; E, P and C bits: enable, reset event counters, reset cycle counter
+        MCR     p15,0,$temp,c9,c12,0            ; ARMv7 PMU control register: start counting from zero
+      ]
+        MEND
+
+        MACRO
+        FastCDA_ProfStart $var,$temp,$temp2,$temp3,$cc
+      [ FastCDA_Prof
+        LDR$cc  $temp,=ZeroPage+FastCDA_Prof_$var ; address of the accumulator word for this section
+        LDR$cc  $temp2,[$temp]                  ; accumulated total so far
+        MRC$cc  p15,0,$temp3,c9,c13,0           ; read the cycle counter
+        SUB$cc  $temp2,$temp2,$temp3            ; subtract the start time; the matching ProfEnd adds the end time back
+        STR$cc  $temp2,[$temp]                  ; all three temp registers are corrupted
+      ]
+        MEND
+
+        MACRO
+        FastCDA_ProfEnd $var,$temp,$temp2,$temp3,$cc
+      [ FastCDA_Prof
+        MRC$cc  p15,0,$temp3,c9,c13,0           ; read the cycle counter first, so the bookkeeping below is not timed
+        LDR$cc  $temp,=ZeroPage+FastCDA_Prof_$var ; address of the accumulator word for this section
+        LDR$cc  $temp2,[$temp]                  ; total with the start time already subtracted by ProfStart
+        ADD$cc  $temp2,$temp2,$temp3            ; add the end time: net effect is total += elapsed cycles
+        STR$cc  $temp2,[$temp]                  ; all three temp registers are corrupted
+      ]
+        MEND
+
 ;******************************************************************************
 ; ChangeDynamic SWI
 ; In  : R0 =  0 => System Heap,
@@ -279,7 +362,7 @@ GetPageFlagsForR0IntoR6 Entry "R0-R2, R4-R5, R7"
 ; MoveCAMatR0toR3
 ; in:   r0 = old logaddr
 ;       r3 = new logaddr
-;       r9 = MEMC CR
+;       r9 = offset from 1st to 2nd copy of doubly mapped area (either source or dest, but not both)
 ;       r11 = page protection level
 ;
 ; out:  r2 = physical page number of page moved, unless there was a serious error
@@ -364,11 +447,36 @@ CamMapBroke
         &       0
         =       "!!!! CAM Map Corrupt !!!!", 0
         ALIGN
+; Call_CAM_Mapping
+; in:   r2 = physical page number
+;       r3 = logical address (2nd copy of doubly mapped area)
+;       r9 = offset from 1st to 2nd copy of doubly mapped area (either source or dest, but not both)
+;       r11 = PPL + CB bits
 Call_CAM_Mapping
         Push    "r0, r1, r4, r6, lr"
         BL      BangCamUpdate
         Pull    "r0, r1, r4, r6, pc"
 
+  [ FastCDA_UpFront
+; CheckCacheabilityR0ByMinusR2
+; in:   r0 = end of area (exclusive; pages are examined from the top downwards)
+;       r2 = size of area in bytes (must be a nonzero multiple of 4K)
+; out:  r6 has DynAreaFlags_NotCacheable set if entire region noncacheable
+;              Flag clear if at least one page is cacheable (scan stops at the first such page)
+;       r0 points to start of area, r2 is preserved
+;
+;       4K page size assumed!
+CheckCacheabilityR0ByMinusR2 ROUT
+        Entry   "r2"
+10
+        SUB     r0, r0, #4096                   ; step down to the next page to examine
+        BL      GetPageFlagsForR0IntoR6         ; r6 = flags of the page at r0
+        SUBS    r2, r2, #4096                   ; r2 = bytes still unexamined below r0
+        TSTNE   r6, #DynAreaFlags_NotCacheable  ; only tested while pages remain; on the last page Z stays set and the loop ends
+        BNE     %BT10                           ; keep going while pages remain and this one is noncacheable
+        SUB     r0, r0, r2                      ; skip the unexamined remainder so r0 = start of area
+        EXIT
+  ]
 
 ; +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ; In    r0 bits 0..6 = area number
@@ -719,6 +827,7 @@ PageFlags_Unavailable           *  1 :SHL: 20                      ; physical pa
 ; Temporary flags only used by kernel
 ;
 PageFlags_Required              *  1 :SHL: 21                      ; physical page asked for by handler
+PageFlags_Unsafe                *  1 :SHL: 22                      ; skip cache/TLB maintenance in BangCamUpdate. flag not saved to CAM map.
 
 
 DynamicAreaSWI Entry
@@ -3007,6 +3116,8 @@ SysHeapString
 ChangeDynamicSWI ROUT
         Push    "r0, r2-r9, r10, lr"
 
+        FastCDA_ProfInit r3
+
         LDR     r10, =ZeroPage                  ; check we're not in an IRQ
         LDR     lr, [r10, #IRQsema]
         TEQ     lr, #0
@@ -3082,7 +3193,7 @@ daq_cda_od6done
         BEQ     IssueServiceMemoryMoved         ; zero pages! (r0 = area number, r1 = size change (0))
         BPL     AreaGrow
 
-AreaShrink
+AreaShrink ROUT
         RSB     r1, r1, #0                      ; make size change positive
  [ DebugCDA2
         DREG    r0, "Shrinking area ", cc
@@ -3177,10 +3288,35 @@ AreaShrink
         LDR     r3, [r12, #DANode_Size]
         ADD     r1, r1, r3                      ; r1 -> address of 1st extra page
 
+      [ FastCDA_UpFront
+        Push    "r0-r1"
+        BL      CheckCacheabilityR0ByMinusR2
+        LDR     r4, [r11, #DANode_Flags]
+        ADR     lr, %FT19
+        TST     r4, #DynAreaFlags_DoublyMapped
+        LDR     r4, =ZeroPage
+        BNE     %FT18
+        ; Interacting with singly-mapped region - use regular logic
+        TST     r6, #DynAreaFlags_NotCacheable
+        MOV     r1, r2, LSR #12
+        ARMop   MMU_ChangingEntries, EQ, tailcall, r4
+        ARMop   MMU_ChangingUncachedEntries, NE, tailcall, r4
+18
+        ; Interacting with doubly-mapped region - use sledgehammer logic
+        TST     r6, #DynAreaFlags_NotCacheable
+        ARMop   MMU_Changing, EQ, tailcall, r4
+        ARMop   MMU_ChangingUncached, NE, tailcall, r4
+19
+        Pull    "r0-r1"
+      ]
+
         LDR     lr, =DynAreaFlags_AccessMask
         MOV     r4, r2
         LDR     r6, [r12, #DANode_Flags]        ; r6 = dst flags
         AND     r6, r6, lr
+      [ FastCDA_UpFront
+        ORR     r6, r6, #PageFlags_Unsafe
+      ]
 20
         SUB     r0, r0, r5                      ; pre-decrement source pointer
  [ DebugCDA2
@@ -3221,6 +3357,9 @@ AreaShrink
         SUB     r1, r1, r5
  [ 1 = 1
         BL      GetPageFlagsForR0IntoR6
+ ]
+ [ FastCDA_UpFront
+        ORR     r6, r6, #PageFlags_Unsafe
  ]
         BL      MovePageAtR0ToR1WithAccessR6
         SUBS    r4, r4, r5
@@ -3241,7 +3380,7 @@ AreaShrink
         LDR     r0, [r11, #DANode_Number]       ; reload dynamic area number
         B       IssueServiceMemoryMoved
 
-AreaGrow
+AreaGrow ROUT
  [ DebugCDA2
         DREG    r0, "Growing area ", cc
         DREG    r1, " by "
@@ -3540,7 +3679,8 @@ ISMM_BatCloak
 ;       C=0 => failed to move as much as we wanted
 ;       C=1 => succeeded in moving as much as we wanted
 
-TryToShrinkShrinkables Entry "r0,r1,r10"
+TryToShrinkShrinkables ROUT
+        Entry "r0,r1,r10"
         LDR     lr, [r11, #DANode_Number]
         TEQ     lr, #ChangeDyn_FreePool
         EXIT    NE                              ; if src <> free pool, exit with C, V flags intact
@@ -3604,26 +3744,33 @@ TryToShrinkShrinkables Entry "r0,r1,r10"
 
                 ^       0, sp
 NumEntries      #       4                       ; Number of entries to do for this chunk
-DestAddr        #       4                       ; Log addr of 1st page being added to dest
-DestFlags       #       4                       ; Page flags for destination area
 TotalAmount     #       4                       ; Total size of grow for this chunk (ie entry value of r3)
-SavedPSR        #       4                       ; PSR before IRQs disabled
-Offset1To2      #       4                       ; Offset from 1st to 2nd bank
 
 DoTheGrowNotSpecifiedStackSize * :INDEX: @      ; amount of stack needed for 'not specified' version
 
+DestAddr        #       4                       ; Log addr of 1st page being added to dest
+DestFlags       #       4                       ; Page flags for destination area
+SavedPSR        #       4                       ; PSR before IRQs disabled
+Offset1To2      #       4                       ; Offset from 1st to 2nd bank
 PageBlock1      #       PageBlockSize           ; 1st page block, for original page numbers and phys. addrs
 PageBlock2      #       PageBlockSize           ; 2nd page block, for new page numbers and phys. addrs
 
 DoTheGrowStackSize *    :INDEX: @
 
-DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize
+; Offset1To2 is only used by the first half of the routine. Reuse the space as flags for the second half:
+                    ^   :INDEX: Offset1To2, sp
+NeedToMoveFlag      #   1                       ; Whether we still need to move the current page
+                    #   3                       ; (spare)
+
+DoTheGrow ROUT
+        Entry "r3,r5,r10-r12", DoTheGrowStackSize
 
 ; First fill in the page block with -1 in the physical page number words
 
         STR     r2, NumEntries                  ; save number of entries for use later
         STR     r7, TotalAmount                 ; save amount growing by
 
+        FastCDA_ProfStart DoTheGrowInit, r0, r1, lr
         ADR     r1, PageBlock1                  ; point at 1st page block on stack
         ADD     lr, r2, r2, LSL #1              ; lr = number of words in page block
         ADD     lr, r1, lr, LSL #2              ; lr -> off end of page block
@@ -3633,6 +3780,7 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize
         STR     r0, [lr, #PageBlockSize]        ; and put -1 in 2nd page block as well
         TEQ     lr, r1                          ; until the end
         BNE     %BT10
+        FastCDA_ProfEnd DoTheGrowInit, r0, r3, lr
 
 ; Now call the pre-grow handler
 
@@ -3675,6 +3823,23 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize
 
         MOVS    r4, r3                          ; amount to do
         BEQ     %FT20                           ; [none, so skip all this]
+
+      [ FastCDA_UpFront
+        ; Perform sledgehammer logic upfront
+        Push    "r0,r2,r6"
+        MOV     r0, r1
+        MOV     r2, r3
+        BL      CheckCacheabilityR0ByMinusR2
+        TST     r6, #DynAreaFlags_NotCacheable
+        LDR     r0, =ZeroPage
+        ADR     lr, %FT14
+        ARMop   MMU_Changing, EQ, tailcall, r0
+        ARMop   MMU_ChangingUncached, NE, tailcall, r0
+14
+        Pull    "r0,r2,r6"
+        ORR     r6, r6, #PageFlags_Unsafe
+      ]
+
         Push    "r0, r1"
         SUB     r0, r1, r3                      ; src starts at start of 1st copy = start of 2nd - old size
         SUB     r1, r0, r2                      ; dst start = src start - amount of room needed
@@ -3690,6 +3855,21 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize
         ADD     r9, r3, r2                      ; set up offset from 1st copy to 2nd copy (= new size)
 25
         ADD     r1, r1, r3                      ; r1 -> address of 1st extra page
+   [ FastCDA_UpFront
+        ; Flush src region from cache
+        Push    "r0-r1,r6"
+        BL      CheckCacheabilityR0ByMinusR2
+        TST     r6, #DynAreaFlags_NotCacheable
+        LDR     r4, =ZeroPage
+        MOV     r1, r2, LSR #12
+        ADR     lr, %FT29
+        ARMop   MMU_ChangingEntries, EQ, tailcall, r4
+        ARMop   MMU_ChangingUncachedEntries, NE, tailcall, r4
+29
+        Pull    "r0-r1,r6"
+        ; Now BangCam for all the pages
+        ORR     r6, r6, #PageFlags_Unsafe
+   ]
         MOV     r4, #0                          ; amount done so far
         MOV     r10, r2                         ; move amount to do into r10, as routine returns page number in r2
         ADR     r3, PageBlock1                  ; point at 1st entry we have to update
@@ -3718,7 +3898,7 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize
         CLRV
         EXIT
 
-37
+DoTheGrowPageUnavailable ROUT
 
 ; Come here if a required page is not available
 ; First we need to go back thru all the part of the page block we've already done,
@@ -3751,11 +3931,12 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize
 
         MakeErrorBlock  CantGetPhysMem
 
-DoTheGrowPagesSpecified
+DoTheGrowPagesSpecified ROUT
 
 ; First check if any of the pages requested are unavailable
 ; At the same time as we're doing this, we fill in the log. and phys. addresses in the block
 
+        FastCDA_ProfStart MarkRequired, r0, r6, lr
         LDR     r0, =ZeroPage
         LDR     r0, [r0, #CamEntriesPointer]
         LDR     r6, =L2PT
@@ -3767,7 +3948,7 @@ DoTheGrowPagesSpecified
         STR     r8, [r1, #PageBlockSize+4-12]   ; and in 2nd page block
 
         TST     lr, #PageFlags_Unavailable :OR: PageFlags_Required ; if page in use by someone else, or by us, then return error
-        BNE     %BT37
+        BNE     DoTheGrowPageUnavailable
         ORR     lr, lr, #PageFlags_Required     ; set bit in flags to say page will be needed
         STR     lr, [r4, #4]                    ; and store back
 
@@ -3785,13 +3966,16 @@ DoTheGrowPagesSpecified
 
         SUBS    r2, r2, #1
         BNE     %BT40
+        FastCDA_ProfEnd MarkRequired, r0, r6, lr
 
 ; now issue Service_PagesUnsafe
 
+        FastCDA_ProfStart PagesUnsafe, r0, r6, lr
         ADR     r2, PageBlock1                  ; r2 -> 1st page block
         LDR     r3, NumEntries                  ; r3 = number of entries in page block
         MOV     r1, #Service_PagesUnsafe
         BL      Issue_Service
+        FastCDA_ProfEnd PagesUnsafe, r0, r6, lr
 
 ; now move the pages
 
@@ -3824,7 +4008,44 @@ DoTheGrowPagesSpecified
 
         MOVS    r4, r3                          ; amount to do
         BEQ     %FT50                           ; [none, so skip all this]
+
+    [ FastCDA_UpFront
+        ; Perform sledgehammer logic upfront
+        Push    "r0,r2,r6"
+        FastCDA_ProfStart DoublyCheckCacheability, r0, r6, lr
+        MOV     r0, r1
+        MOV     r2, r3
+        BL      CheckCacheabilityR0ByMinusR2
+        FastCDA_ProfEnd DoublyCheckCacheability, r0, r2, lr
+        TST     r6, #DynAreaFlags_NotCacheable
+        LDR     r2, =ZeroPage
+      [ FastCDA_Prof
+        LDREQ   lr,[r2,#FastCDA_Prof_MMUChanging]
+        LDRNE   lr,[r2,#FastCDA_Prof_MMUChangingUncached]
+        MRC     p15,0,r0,c9,c13,0
+        SUB     lr,lr,r0
+        STREQ   lr,[r2,#FastCDA_Prof_MMUChanging]
+        STRNE   lr,[r2,#FastCDA_Prof_MMUChangingUncached]
+      ]
+        ADR     lr, %FT44
+        ARMop   MMU_Changing, EQ, tailcall, r2
+        ARMop   MMU_ChangingUncached, NE, tailcall, r2
+44
+      [ FastCDA_Prof
+        MRC     p15,0,r0,c9,c13,0
+        TST     r6, #DynAreaFlags_NotCacheable
+        LDREQ   lr,[r2,#FastCDA_Prof_MMUChanging]
+        LDRNE   lr,[r2,#FastCDA_Prof_MMUChangingUncached]
+        ADD     lr,lr,r0
+        STREQ   lr,[r2,#FastCDA_Prof_MMUChanging]
+        STRNE   lr,[r2,#FastCDA_Prof_MMUChangingUncached]
+      ]
+        Pull    "r0,r2,r6"
+        ORR     r6, r6, #PageFlags_Unsafe
+    ]
+
         Push    "r0, r1"
+        FastCDA_ProfStart DoublyMovePages, r0, r9, lr
         SUB     r0, r1, r3                      ; src starts at start of 1st copy = start of 2nd - old size
         SUB     r1, r0, r2                      ; dst start = src start - amount of room needed
         MOV     r9, #0                          ; no funny business while moving these pages
@@ -3834,6 +4055,7 @@ DoTheGrowPagesSpecified
         ADD     r1, r1, r5                      ; advance dst ptr
         SUBS    r4, r4, r5                      ; one less page to move
         BNE     %BT45                           ; loop if more
+        FastCDA_ProfEnd DoublyMovePages, r0, r9, lr
         Pull    "r0, r1"                        ; restore original regs
 50
         ADD     r9, r3, r2                      ; set up offset from 1st copy to 2nd copy (= new size)
@@ -3851,6 +4073,7 @@ DoTheGrowPagesSpecified
 ; Now before we start, we must construct the second page block, with replacement page numbers
 
 ;        DLINE   "Start of 1st loop"
+        FastCDA_ProfStart FindSpare, r6, r1, lr
 
 60
         LDR     r6, [r8], #12                   ; r6 = page number required
@@ -3907,11 +4130,21 @@ DoTheGrowPagesSpecified
 
         SUBS    r9, r9, #1                      ; one less entry to do
         BNE     %BT60
+        FastCDA_ProfEnd FindSpare, r7, r1, lr
 
         MOV     r7, r3                          ; r7 -> camentries
 
 ; Now we can go onto the 2nd loop which actually moves the pages
 
+     [ FastCDA_FIQs
+        ; Claim FIQs for this entire loop
+        ; (With the old behaviour, for large grows, total time in ReleaseFIQ could be several centiseconds, since the kernel reinstalls the default handler each time)
+        FastCDA_ProfStart ClaimFIQ, r6, r1, lr
+        MOV     r1, #Service_ClaimFIQ
+        BL      Issue_Service
+        FastCDA_ProfEnd ClaimFIQ, r6, r1, lr
+     ]
+
         LDR     r1, DestAddr
         MOV     r4, #0                          ; amount done
         MOV     r0, r7                          ; point r0 at camentries
@@ -3922,9 +4155,22 @@ DoTheGrowPagesSpecified
         MRS     r14, CPSR
         STR     r14, SavedPSR                   ; save old PSR (note: stack must be flat when we do this!)
 
+    [ FastCDA_NoPhysical
+        ; Grab the flags for the page we're replacing; NoPhysical optimisation means the page may get mapped to its target pos earlier than before, causing the flags in the CAM map to be "wrong" when we read them back out later on
+        LDR     r11, [r8, #0]                   ; need to get PPL for page being replaced
+        ADD     lr, r0, #4                      ; point at PPLs, not addresses
+        LDR     r11, [lr, r11, LSL #3]
+        MOV     lr, #1
+        STRB    lr, NeedToMoveFlag
+    ]
+
         Push    "r0-r4,r7-r12"                  ; save regs used during copy
+    [ :LNOT: FastCDA_FIQs
+        FastCDA_ProfStart ClaimFIQ, r6, r1, lr
         MOV     r1, #Service_ClaimFIQ
         BL      Issue_Service
+        FastCDA_ProfEnd ClaimFIQ, r6, r1, lr
+    ]
 
         WritePSRc I_bit+SVC_mode, r6            ; disable IRQs round here (we don't want interrupt code to update
                                                 ; the old mapping behind us while we're trying to copy it)
@@ -3941,24 +4187,50 @@ DoTheGrowPagesSpecified
         ; The old scheme, always copying from other mapping, had interrupt cache coherency hole, at least for
         ; ARM with writeback cache (bug in 3.7, fixed in Ursula, then lost)
 
+        LDR     r6, [r8, #4]                    ;logical address of src page
+
+      [ FastCDA_CorruptFreePool
+        ; If the source is in the free pool, we don't need to preserve its contents
+        LDR     r3, =ZeroPage+FreePoolDANode
+        SUB     r2, r6, #FreePoolAddress
+        LDR     r3, [r3, #DANode_MaxSize]
+        CMP     r2, r3
+        BLO     %FT74
+      ]
+
         LDR     r0, [r0, lr, LSL #3]            ; r0 = log. address for replacement page (NB use logical address to write to, for cache consistency)
 
-        LDR     r6, [r8, #4]                    ;logical address of src page
         LDR     r3, =Nowhere
         TEQ     r6, r3                          ;will be 'Nowhere' if not mapped in
         BNE     %FT71
 
+      [ FastCDA_NoPhysical
+        ; No need to use AccessPhysicalAddress here - if the required page isn't mapped in, all we need to do is map it in at the target address and use that mapping to copy the data out
+        LDR     r2, [r8, #0]
+        LDR     r3, [sp, #4] ; Get stacked r1
+        LDR     r11, [sp, #11*4+:INDEX:DestFlags]
+        FastCDA_ProfStart MoveNeeded, lr, r4, r7
+        BL      Call_CAM_Mapping                ; move needed page to destination
+        FastCDA_ProfEnd MoveNeeded, lr, r4, r7
+        MOV     r6, r3                          ; r6 = logical address of src for copy
+        MOV     lr, #0
+        STRB    lr, [sp, #11*4+:INDEX:NeedToMoveFlag]
+      |
         ASSERT  HAL
         SUB     sp, sp, #4                      ; for oldp
         Push    "r0,r1"
+        FastCDA_ProfStart AccessPhysical, r0, r1, lr
         MOV     r0, #0
         LDR     r1, [r8, #8]                    ; r1 = physical address of src for copy
         ADD     r2, sp, #8                      ; must use physical address, as page may be mapped to nowhere along with others
         BL      RISCOS_AccessPhysicalAddress
         MOV     r6, r0                          ; r6 = logical address of src for copy
+        FastCDA_ProfEnd AccessPhysical, r0, r1, lr
         Pull    "r0,r1"
+      ]
 
 71
+        FastCDA_ProfStart CopyPage, r2, r3, r4
         ADD     lr, r6, r5                      ; lr = end src address
 72
         LDMIA   r6!, {r2, r3, r4, r7, r9, r10, r11, r12}
@@ -3966,12 +4238,24 @@ DoTheGrowPagesSpecified
         TEQ     r6, lr
         BNE     %BT72
 
+        FastCDA_ProfEnd CopyPage, r2, r3, r4
+
+     [ :LNOT: FastCDA_NoPhysical
         LDR     r0, [r8, #4]                    ;logical address of src page
         LDR     r3, =Nowhere
         TEQ     r0, r3
 
         Pull    "r0", EQ                        ; oldp
+      [ FastCDA_Prof
+        BNE     %FT73
+        FastCDA_ProfStart ReleasePhysical, r2, r3, r4
+      ]
         BLEQ    RISCOS_ReleasePhysicalAddress
+      [ FastCDA_Prof
+        FastCDA_ProfEnd ReleasePhysical, r2, r3, r4
+73
+      ]
+     ]
 
 ; now check if page we're replacing is in L2PT, and if so then adjust L1PT entries (4 of these)
 
@@ -4010,38 +4294,71 @@ DoTheGrowPagesSpecified
         ; to avoid serious grief in the awkward cases. Fortunately, these page substitutions are relatively
         ; rare, so performance is not critical.
 
+ [ :LNOT: FastCDA_NoPhysical
         LDR     lr, =ZeroPage
         LDR     lr, [lr, #CamEntriesPointer]    ; lr -> soft cam map
         ADD     lr, lr, #4                      ; point at PPLs, not addresses
         LDR     r2, [r8, #0]                    ; need to get PPL for page being replaced
         LDR     r11, [lr, r2, LSL #3]
+ ]
         BIC     r11, r11, #PageFlags_Required   ; knock off bits that indicate that it was a required page
 
         ADD     lr, r8, #PageBlockSize
         LDMIA   lr, {r2, r3}                    ; get page number, logical address
 
+        LDR     lr, =Nowhere                    ; There's no point in cleaning the nowhere page, and on some architectures it'll even trigger an abort handler due to the lack of mapping
+        TEQ     r3, lr
+        BEQ     %FT75
+      [ FastCDA_Unnecessary
+        ; We only need to clean the cache/TLB if the page is cacheable
+        ; For uncacheable pages, BangL2PT will flush the TLB for us just before the mapping is updated
+        TST     r11, #DynAreaFlags_NotCacheable
+        BNE     %FT75
+      ]
         Push    "r0, r4"
-        LDR     r4, =Nowhere                    ; There's no point in cleaning the nowhere page, and on some architectures it'll even trigger an abort handler due to the lack of mapping
         MOV     r0, r3
-        TEQ     r3, r4
-        LDRNE   r4, =ZeroPage
-        ARMop   MMU_ChangingEntry,NE,,r4
+      [ FastCDA_Prof
+        FastCDA_ProfStart ChangingEntry, r6, lr, r4
+      ]
+        LDR     r4, =ZeroPage
+        ARMop   MMU_ChangingEntry,,,r4
+      [ FastCDA_Prof
+        FastCDA_ProfEnd ChangingEntry, r6, lr, r4
+      ]
         Pull    "r0, r4"
+75
 
+        FastCDA_ProfStart MoveReplacement, r6, lr, r5
         BL      Call_CAM_Mapping                ; move replacement page in
+        FastCDA_ProfEnd MoveReplacement, r6, lr, r5
 76
         LDR     r2, [r8, #0]
         MOV     r3, r1
         LDR     r11, DestFlags
+      [ FastCDA_NoPhysical
+        LDRB    lr, NeedToMoveFlag
+        TEQ     lr, #0
+        BEQ     %FT77                           ; don't bother if page already been moved to dest
+      ]
+        FastCDA_ProfStart MoveNeeded, r6, lr, r5
         BL      Call_CAM_Mapping                ; move needed page to destination
+        FastCDA_ProfEnd MoveNeeded, r6, lr, r5
 
+77
         LDR     lr, SavedPSR
         MSR     CPSR_cf, lr
 
+      [ :LNOT: FastCDA_FIQs
         Push    "r1"
+        FastCDA_ProfStart ReleaseFIQ, r1, lr, r5
         MOV     r1, #Service_ReleaseFIQ
         BL      Issue_Service
+        FastCDA_ProfEnd ReleaseFIQ, r1, lr, r5
         Pull    "r1"
+      ]
+      [ FastCDA_Prof
+        MOV     r5, #4096
+      ]
 
         ADD     r1, r1, r5                      ; advance dest ptr
         ADD     r4, r4, r5                      ; increment amount done
@@ -4049,17 +4366,26 @@ DoTheGrowPagesSpecified
         CMP     r4, r7                          ; have we done all?
         BNE     %BT70                           ; [no, so loop]
 
+     [ FastCDA_FIQs
+        FastCDA_ProfStart ReleaseFIQ, r1, lr, r2
+        MOV     r1, #Service_ReleaseFIQ
+        BL      Issue_Service
+        FastCDA_ProfEnd ReleaseFIQ, r1, lr, r2
+     ]
+
         LDR     r3, [r12, #DANode_Size]
         ADD     r3, r3, r7
         STR     r3, [r12, #DANode_Size]         ; store increased destination size
 
 ; now issue Service_PagesSafe
 
+        FastCDA_ProfStart PagesSafe, r1, r2, r3
         LDR     r2, NumEntries
         ADR     r3, PageBlock1
         ADR     r4, PageBlock2
         MOV     r1, #Service_PagesSafe
         BL      Issue_Service
+        FastCDA_ProfEnd PagesSafe, r1, r2, r3
 
 ; now call Post_Grow handler
 
@@ -4090,7 +4416,8 @@ DoTheGrowPagesSpecified
 ; Note: Removal is from one area only, the calling routine breaks the chunk at free/app boundary.
 
 
-DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize
+DoTheGrowNotSpecified ROUT
+        Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize
 
         STR     r2, NumEntries                  ; save number of entries for use later
         STR     r7, TotalAmount                 ; save amount growing by
@@ -4124,6 +4451,23 @@ DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize
 
         MOVS    r4, r3                          ; amount to do
         BEQ     %FT20                           ; [none, so skip all this]
+
+      [ FastCDA_UpFront
+        ; Perform the sledgehammer cache/TLB maintenance once upfront, then set PageFlags_Unsafe in r6
+        Push    "r0,r2,r6"
+        MOV     r0, r1
+        MOV     r2, r3
+        BL      CheckCacheabilityR0ByMinusR2
+        TST     r6, #DynAreaFlags_NotCacheable
+        LDR     r0, =ZeroPage
+        ADR     lr, %FT14
+        ARMop   MMU_Changing, EQ, tailcall, r0
+        ARMop   MMU_ChangingUncached, NE, tailcall, r0
+14
+        Pull    "r0,r2,r6"
+        ORR     r6, r6, #PageFlags_Unsafe
+      ]
+
         Push    "r0, r1"
         SUB     r0, r1, r3                      ; src starts at start of 1st copy = start of 2nd - old size
         SUB     r1, r0, r2                      ; dst start = src start - amount of room needed
@@ -4139,6 +4483,21 @@ DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize
         ADD     r9, r3, r2                      ; set up offset from 1st copy to 2nd copy (= new size)
 25
         ADD     r1, r1, r3                      ; r1 -> address of 1st extra page
+   [ FastCDA_UpFront
+        ; Flush src region from cache
+        Push    "r0-r1,r6"
+        BL      CheckCacheabilityR0ByMinusR2
+        TST     r6, #DynAreaFlags_NotCacheable
+        LDR     r4, =ZeroPage
+        MOV     r1, r2, LSR #12
+        ADR     lr, %FT29
+        ARMop   MMU_ChangingEntries, EQ, tailcall, r4
+        ARMop   MMU_ChangingUncachedEntries, NE, tailcall, r4
+29
+        Pull    "r0-r1,r6"
+        ; Now BangCam for all the pages
+        ORR     r6, r6, #PageFlags_Unsafe
+   ]
         MOV     r4, #0                          ; amount done so far
         MOV     r10, r2                         ; move amount to do into r10
 30
@@ -4182,7 +4541,8 @@ DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize
 ;       endif
 ;
 
-CheckAppSpace Entry "r0-r3"
+CheckAppSpace ROUT
+        Entry "r0-r3"
         LDR     r2, =ZeroPage
         LDR     r3, [r2, #AplWorkSize]
         LDR     r2, [r2, #Curr_Active_Object]
@@ -4336,12 +4696,14 @@ CallPreGrow Entry "r0,r4, r12"
         CMP     r0, #0                                  ; if none (V=0)
         EXIT    EQ                                      ; then exit
 
+        FastCDA_ProfStart CallPreGrow, r0, r4, lr
         MOV     r0, #DAHandler_PreGrow                  ; r0 = reason code
         LDR     r4, [r12, #DANode_Size]                 ; r4 = current size
         ASSERT  DANode_Handler = DANode_Workspace +4
         ADD     r12, r12, #DANode_Workspace
         MOV     lr, pc
         LDMIA   r12, {r12, pc}                          ; load workspace pointer and jump to handler
+        FastCDA_ProfEnd CallPreGrow, r12, r4, lr
         EXIT    VC                                      ; if no error then exit
 
         TEQ     r0, #0                                  ; if generic error returned
@@ -4371,12 +4733,14 @@ CallPostGrow Entry "r0,r3,r4, r12"
         CMP     r0, #0                                  ; if none (V=0)
         EXIT    EQ                                      ; then exit
 
+        FastCDA_ProfStart CallPostGrow, r0, r4, lr
         MOV     r0, #DAHandler_PostGrow                 ; r0 = reason code
         LDR     r4, [r12, #DANode_Size]                 ; r4 = new size
         ASSERT  DANode_Handler = DANode_Workspace +4
         ADD     r12, r12, #DANode_Workspace
         MOV     lr, pc
         LDMIA   r12, {r12, pc}                          ; load workspace pointer and jump to handler
+        FastCDA_ProfEnd CallPostGrow, r12, r4, lr
         EXIT
 
  [ ShrinkableDAs
diff --git a/s/VMSAv6 b/s/VMSAv6
index bcff6c6..7d65228 100644
--- a/s/VMSAv6
+++ b/s/VMSAv6
@@ -75,7 +75,8 @@ BangCamUpdate ROUT
         LDR     r1, [r1, #CamEntriesPointer]
         ADD     r1, r1, r2, LSL #3              ; point at cam entry (logaddr, PPL)
         LDMIA   r1, {r0, r6}                    ; r0 = current logaddress, r6 = current PPL
-        STMIA   r1, {r3, r11}                   ; store new address, PPL
+        BIC     r4, r11, #PageFlags_Unsafe
+        STMIA   r1, {r3, r4}                    ; store new address, PPL
         Push    "r0, r6"                        ; save old logical address, PPL
         LDR     r1, =ZeroPage+PhysRamTable      ; go through phys RAM table
         MOV     r6, r2                          ; make copy of r2 (since that must be preserved)
@@ -105,11 +106,15 @@ BangCamUpdate ROUT
         TEQ     r4, r0, LSR #12                 ; if equal to physical address of page being moved
         BNE     %FT20                           ; if not there, then just put in new page
 
+        AND     r4, r11, #PageFlags_Unsafe
         Push    "r0, r3, r11, r14"              ; save phys.addr, new log.addr, new PPL, lr
         ADD     r3, sp, #4*4
         LDMIA   r3, {r3, r11}                   ; reload old logical address, old PPL
+        LDR     r0, =DuffEntry                  ; Nothing to do if the page wasn't mapped in
+        ORR     r11, r11, r4
+        TEQ     r3, r0
         MOV     r0, #0                          ; cause translation fault
-        BL      BangL2PT                        ; map page out
+        BLNE    BangL2PT                        ; map page out
         Pull    "r0, r3, r11, r14"
 20
         ADD     sp, sp, #8                      ; junk old logical address, PPL
@@ -197,6 +202,9 @@ BangL2PT                                        ; internal entry point used only
         Push    "lr"
         MOV     r6, r0
 
+        TST     r11, #PageFlags_Unsafe
+        BNE     %FT30
+
         TST     r11, #DynAreaFlags_DoublyMapped
         BNE     BangL2PT_sledgehammer           ;if doubly mapped, don't try to be clever
 
-- 
GitLab