From 5e11e66520ac367224221fb84cd899538accce76 Mon Sep 17 00:00:00 2001 From: Jeffrey Lee <jlee@gitlab.riscosopen.org> Date: Sun, 15 Apr 2012 19:48:09 +0000 Subject: [PATCH] OS_ChangeDynamicArea performance optimisations Detail: s/ChangeDyn: - Apply various optimisations to OS_ChangeDynamicArea to reduce the execution time when performing large grows/shrinks. - Optimisations can be toggled on/off with FastCDA_* flags for debugging. - On a 1GHz 512MB BB-xM, the initial *FreePool call now takes 0.15s instead of 13.46s. On a 512MB Iyonix the time has dropped from 1.18s to 0.23s. - Growing screen memory (on BB-xM) has also seen significant gains - between 2x and 4x speedup, depending on what state the source pages are in. - Added/updated documentation for a few functions and made more use of ROUTs for safety s/ARM600, s/VMSAv6: - Update BangCamUpdate, etc. to add support for the PageFlags_Unsafe flag that OS_ChangeDynamicArea uses to bypass cache/TLB maintenance in some situations - Avoid BangCamUpdate calling BangL2PT to map out the page if the page isn't mapped in (avoids unnecessary cache/TLB flush) s/ArthurSWIs: - Add extra ASSERT for safety s/AMBControl/Memory: - Fix incorrect assumption that the usable size of a heap block is always 8 less than the value stored in the header. Even with the old 8 byte aligned allocations the usable size will always be 4 bytes less than the value in the header. This code would have resulted in some slight memory wastage, as AMBControl would always have tried growing the block four bytes bigger than needed. Admin: Tested on Iyonix & BB-xM Version 5.35, 4.79.2.146. 
Tagged as 'Kernel-5_35-4_79_2_146' --- VersionASM | 10 +- VersionNum | 14 +- s/AMBControl/Memory | 2 +- s/ARM600 | 12 +- s/ArthurSWIs | 1 + s/ChangeDyn | 402 +++++++++++++++++++++++++++++++++++++++++--- s/VMSAv6 | 12 +- 7 files changed, 417 insertions(+), 36 deletions(-) diff --git a/VersionASM b/VersionASM index b0fca0c..1537ffb 100644 --- a/VersionASM +++ b/VersionASM @@ -13,11 +13,11 @@ GBLS Module_ComponentPath Module_MajorVersion SETS "5.35" Module_Version SETA 535 -Module_MinorVersion SETS "4.79.2.145" -Module_Date SETS "08 Apr 2012" -Module_ApplicationDate SETS "08-Apr-12" +Module_MinorVersion SETS "4.79.2.146" +Module_Date SETS "15 Apr 2012" +Module_ApplicationDate SETS "15-Apr-12" Module_ComponentName SETS "Kernel" Module_ComponentPath SETS "castle/RiscOS/Sources/Kernel" -Module_FullVersion SETS "5.35 (4.79.2.145)" -Module_HelpVersion SETS "5.35 (08 Apr 2012) 4.79.2.145" +Module_FullVersion SETS "5.35 (4.79.2.146)" +Module_HelpVersion SETS "5.35 (15 Apr 2012) 4.79.2.146" END diff --git a/VersionNum b/VersionNum index ac8e2be..5c76c40 100644 --- a/VersionNum +++ b/VersionNum @@ -5,19 +5,19 @@ * */ #define Module_MajorVersion_CMHG 5.35 -#define Module_MinorVersion_CMHG 4.79.2.145 -#define Module_Date_CMHG 08 Apr 2012 +#define Module_MinorVersion_CMHG 4.79.2.146 +#define Module_Date_CMHG 15 Apr 2012 #define Module_MajorVersion "5.35" #define Module_Version 535 -#define Module_MinorVersion "4.79.2.145" -#define Module_Date "08 Apr 2012" +#define Module_MinorVersion "4.79.2.146" +#define Module_Date "15 Apr 2012" -#define Module_ApplicationDate "08-Apr-12" +#define Module_ApplicationDate "15-Apr-12" #define Module_ComponentName "Kernel" #define Module_ComponentPath "castle/RiscOS/Sources/Kernel" -#define Module_FullVersion "5.35 (4.79.2.145)" -#define Module_HelpVersion "5.35 (08 Apr 2012) 4.79.2.145" +#define Module_FullVersion "5.35 (4.79.2.146)" +#define Module_HelpVersion "5.35 (15 Apr 2012) 4.79.2.146" #define Module_LibraryVersionInfo "5:35" diff --git 
a/s/AMBControl/Memory b/s/AMBControl/Memory index 8fc2f47..4932ec3 100644 --- a/s/AMBControl/Memory +++ b/s/AMBControl/Memory @@ -50,7 +50,7 @@ AMB_BlockResize ROUT ADD r3,r3,#AMBblockQ - 1 BIC r3,r3,#AMBblockQ - 1 LDR r1,[r2,#-4] ;pick up OS_Heap's size word (naughty!) - SUB r1,r1,#8 ;heap size will be 8 more than quantised size + SUB r1,r1,#4 ;heap size will be (at least) 4 more than quantised size SUBS r3,r3,r1 ;required size change MOVNE r0, #HeapReason_ExtendBlock BLNE DoSysHeapOpWithExtension diff --git a/s/ARM600 b/s/ARM600 index d14728d..fd55935 100644 --- a/s/ARM600 +++ b/s/ARM600 @@ -218,7 +218,8 @@ BangCamUpdate ROUT LDR r1, [r1, #CamEntriesPointer] ADD r1, r1, r2, LSL #3 ; point at cam entry (logaddr, PPL) LDMIA r1, {r0, r6} ; r0 = current logaddress, r6 = current PPL - STMIA r1, {r3, r11} ; store new address, PPL + BIC r4, r11, #PageFlags_Unsafe + STMIA r1, {r3, r4} ; store new address, PPL Push "r0, r6" ; save old logical address, PPL LDR r1, =ZeroPage+PhysRamTable ; go through phys RAM table MOV r6, r2 ; make copy of r2 (since that must be preserved) @@ -248,11 +249,15 @@ BangCamUpdate ROUT TEQ r4, r0, LSR #12 ; if equal to physical address of page being moved BNE %FT20 ; if not there, then just put in new page + AND r4, r11, #PageFlags_Unsafe Push "r0, r3, r11, r14" ; save phys.addr, new log.addr, new PPL, lr ADD r3, sp, #4*4 LDMIA r3, {r3, r11} ; reload old logical address, old PPL + LDR r0, =DuffEntry ; Nothing to do if wasn't mapped in + ORR r11, r11, r4 + TEQ r3, r0 MOV r0, #0 ; cause translation fault - BL BangL2PT ; map page out + BLNE BangL2PT ; map page out Pull "r0, r3, r11, r14" 20 ADD sp, sp, #8 ; junk old logical address, PPL @@ -340,6 +345,9 @@ BangL2PT ; internal entry point used only Push "lr" MOV r6, r0 + TST r11, #PageFlags_Unsafe + BNE %FT30 + TST r11, #DynAreaFlags_DoublyMapped BNE BangL2PT_sledgehammer ;if doubly mapped, don't try to be clever diff --git a/s/ArthurSWIs b/s/ArthurSWIs index 93172a7..9db8f57 100644 --- 
a/s/ArthurSWIs +++ b/s/ArthurSWIs @@ -1083,6 +1083,7 @@ fakeservicecall MOV r0, r0 LDR r10, =ZeroPage LDRB r9, [r10, #FIQclaim_interlock] + ASSERT (ZeroPage :AND: 255) = 0 STRB r10, [r10, #FIQclaim_interlock] [ FIQDebug diff --git a/s/ChangeDyn b/s/ChangeDyn index 3b8a382..4023df2 100644 --- a/s/ChangeDyn +++ b/s/ChangeDyn @@ -14,6 +14,89 @@ ; TTL => ChangeDyn + ; OS_ChangeDynamicArea optimisations: + + GBLL FastCDA_UpFront ; Do all cache/TLB maintenance upfront instead of on a per-page basis +FastCDA_UpFront SETL {TRUE} + + GBLL FastCDA_FIQs ; Don't thrash ClaimFIQ/ReleaseFIQ in DoTheGrowPagesSpecified +FastCDA_FIQs SETL {TRUE} + + GBLL FastCDA_NoPhysical ; Don't use RISCOS_AccessPhysicalAddress/RISCOS_ReleasePhysicalAddress in DoTheGrowPagesSpecified. Instead, map pages straight to destination address. +FastCDA_NoPhysical SETL {TRUE} + + GBLL FastCDA_CorruptFreePool ; Contents of free pool doesn't need preserving in DoTheGrowPagesSpecified +FastCDA_CorruptFreePool SETL {TRUE} + + GBLL FastCDA_Unnecessary ; Avoid unnecessary MMU_ChangingEntry calls in DoTheGrowPagesSpecified +FastCDA_Unnecessary SETL {TRUE} + + ; DoTheGrowPagesSpecified profiling code + ; Written to use Cortex-A8 cycle count performance counter - will need modifying for other CPUs! + + GBLL FastCDA_Prof +FastCDA_Prof SETL {FALSE} + + [ FastCDA_Prof + ; Squeeze profiling workspace into "free space after envstring" + ^ ExtendedROMFooter+4 + ! 
0, "FastCDA_Prof workspace at ":CC::STR:@ +FastCDA_Prof_DoTheGrowInit # 4 +FastCDA_Prof_MarkRequired # 4 +FastCDA_Prof_PagesUnsafe # 4 +FastCDA_Prof_DoublyCheckCacheability # 4 +FastCDA_Prof_DoublyMovePages # 4 +FastCDA_Prof_FindSpare # 4 +FastCDA_Prof_ClaimFIQ # 4 +FastCDA_Prof_AccessPhysical # 4 +FastCDA_Prof_CopyPage # 4 +FastCDA_Prof_ReleasePhysical # 4 +FastCDA_Prof_MoveReplacement # 4 +FastCDA_Prof_MoveNeeded # 4 +FastCDA_Prof_ReleaseFIQ # 4 +FastCDA_Prof_PagesSafe # 4 +FastCDA_Prof_CallPreGrow # 4 +FastCDA_Prof_CallPostGrow # 4 +FastCDA_Prof_MMUChanging # 4 +FastCDA_Prof_MMUChangingUncached # 4 +FastCDA_Prof_ChangingEntry # 4 + ASSERT @ <= &500 + ] + + MACRO + FastCDA_ProfInit $temp + [ FastCDA_Prof + MVN $temp,#0 + MCR p15,0,$temp,c9,c12,2 + MOV $temp,#1<<31 + MCR p15,0,$temp,c9,c12,1 + MOV $temp,#7 + MCR p15,0,$temp,c9,c12,0 + ] + MEND + + MACRO + FastCDA_ProfStart $var,$temp,$temp2,$temp3,$cc + [ FastCDA_Prof + LDR$cc $temp,=ZeroPage+FastCDA_Prof_$var + LDR$cc $temp2,[$temp] + MRC$cc p15,0,$temp3,c9,c13,0 + SUB$cc $temp2,$temp2,$temp3 + STR$cc $temp2,[$temp] + ] + MEND + + MACRO + FastCDA_ProfEnd $var,$temp,$temp2,$temp3,$cc + [ FastCDA_Prof + MRC$cc p15,0,$temp3,c9,c13,0 + LDR$cc $temp,=ZeroPage+FastCDA_Prof_$var + LDR$cc $temp2,[$temp] + ADD$cc $temp2,$temp2,$temp3 + STR$cc $temp2,[$temp] + ] + MEND + ;****************************************************************************** ; ChangeDynamic SWI ; In : R0 = 0 => System Heap, @@ -279,7 +362,7 @@ GetPageFlagsForR0IntoR6 Entry "R0-R2, R4-R5, R7" ; MoveCAMatR0toR3 ; in: r0 = old logaddr ; r3 = new logaddr -; r9 = MEMC CR +; r9 = offset from 1st to 2nd copy of doubly mapped area (either source or dest, but not both) ; r11 = page protection level ; ; out: r2 = physical page number of page moved, unless there was a serious error @@ -364,11 +447,36 @@ CamMapBroke & 0 = "!!!! 
CAM Map Corrupt !!!!", 0 ALIGN +; Call_CAM_Mapping +; in: r2 = physical page number +; r3 = logical address (2nd copy of doubly mapped area) +; r9 = offset from 1st to 2nd copy of doubly mapped area (either source or dest, but not both) +; r11 = PPL + CB bits Call_CAM_Mapping Push "r0, r1, r4, r6, lr" BL BangCamUpdate Pull "r0, r1, r4, r6, pc" + [ FastCDA_UpFront +; CheckCacheabilityR0ByMinusR2 +; in: r0 = end of area +; r2 = size of area (must be nonzero) +; out: r6 has DynAreaFlags_NotCacheable set if entire region noncacheable +; Flag clear if at least one page is cacheable +; r0 points to start of area +; +; 4K page size assumed! +CheckCacheabilityR0ByMinusR2 ROUT + Entry "r2" +10 + SUB r0, r0, #4096 + BL GetPageFlagsForR0IntoR6 + SUBS r2, r2, #4096 + TSTNE r6, #DynAreaFlags_NotCacheable + BNE %BT10 + SUB r0, r0, r2 + EXIT + ] ; +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ; In r0 bits 0..6 = area number @@ -719,6 +827,7 @@ PageFlags_Unavailable * 1 :SHL: 20 ; physical pa ; Temporary flags only used by kernel ; PageFlags_Required * 1 :SHL: 21 ; physical page asked for by handler +PageFlags_Unsafe * 1 :SHL: 22 ; skip cache/TLB maintenance in BangCamUpdate. flag not saved to CAM map. DynamicAreaSWI Entry @@ -3007,6 +3116,8 @@ SysHeapString ChangeDynamicSWI ROUT Push "r0, r2-r9, r10, lr" + FastCDA_ProfInit r3 + LDR r10, =ZeroPage ; check we're not in an IRQ LDR lr, [r10, #IRQsema] TEQ lr, #0 @@ -3082,7 +3193,7 @@ daq_cda_od6done BEQ IssueServiceMemoryMoved ; zero pages! 
(r0 = area number, r1 = size change (0)) BPL AreaGrow -AreaShrink +AreaShrink ROUT RSB r1, r1, #0 ; make size change positive [ DebugCDA2 DREG r0, "Shrinking area ", cc @@ -3177,10 +3288,35 @@ AreaShrink LDR r3, [r12, #DANode_Size] ADD r1, r1, r3 ; r1 -> address of 1st extra page + [ FastCDA_UpFront + Push "r0-r1" + BL CheckCacheabilityR0ByMinusR2 + LDR r4, [r11, #DANode_Flags] + ADR lr, %FT19 + TST r4, #DynAreaFlags_DoublyMapped + LDR r4, =ZeroPage + BNE %FT18 + ; Interacting with singly-mapped region - use regular logic + TST r6, #DynAreaFlags_NotCacheable + MOV r1, r2, LSR #12 + ARMop MMU_ChangingEntries, EQ, tailcall, r4 + ARMop MMU_ChangingUncachedEntries, NE, tailcall, r4 +18 + ; Interacting with doubly-mapped region - use sledgehammer logic + TST r6, #DynAreaFlags_NotCacheable + ARMop MMU_Changing, EQ, tailcall, r4 + ARMop MMU_ChangingUncached, NE, tailcall, r4 +19 + Pull "r0-r1" + ] + LDR lr, =DynAreaFlags_AccessMask MOV r4, r2 LDR r6, [r12, #DANode_Flags] ; r6 = dst flags AND r6, r6, lr + [ FastCDA_UpFront + ORR r6, r6, #PageFlags_Unsafe + ] 20 SUB r0, r0, r5 ; pre-decrement source pointer [ DebugCDA2 @@ -3221,6 +3357,9 @@ AreaShrink SUB r1, r1, r5 [ 1 = 1 BL GetPageFlagsForR0IntoR6 + ] + [ FastCDA_UpFront + ORR r6, r6, #PageFlags_Unsafe ] BL MovePageAtR0ToR1WithAccessR6 SUBS r4, r4, r5 @@ -3241,7 +3380,7 @@ AreaShrink LDR r0, [r11, #DANode_Number] ; reload dynamic area number B IssueServiceMemoryMoved -AreaGrow +AreaGrow ROUT [ DebugCDA2 DREG r0, "Growing area ", cc DREG r1, " by " @@ -3540,7 +3679,8 @@ ISMM_BatCloak ; C=0 => failed to move as much as we wanted ; C=1 => succeeded in moving as much as we wanted -TryToShrinkShrinkables Entry "r0,r1,r10" +TryToShrinkShrinkables ROUT + Entry "r0,r1,r10" LDR lr, [r11, #DANode_Number] TEQ lr, #ChangeDyn_FreePool EXIT NE ; if src <> free pool, exit with C, V flags intact @@ -3604,26 +3744,33 @@ TryToShrinkShrinkables Entry "r0,r1,r10" ^ 0, sp NumEntries # 4 ; Number of entries to do for this chunk -DestAddr # 4 
; Log addr of 1st page being added to dest -DestFlags # 4 ; Page flags for destination area TotalAmount # 4 ; Total size of grow for this chunk (ie entry value of r3) -SavedPSR # 4 ; PSR before IRQs disabled -Offset1To2 # 4 ; Offset from 1st to 2nd bank DoTheGrowNotSpecifiedStackSize * :INDEX: @ ; amount of stack needed for 'not specified' version +DestAddr # 4 ; Log addr of 1st page being added to dest +DestFlags # 4 ; Page flags for destination area +SavedPSR # 4 ; PSR before IRQs disabled +Offset1To2 # 4 ; Offset from 1st to 2nd bank PageBlock1 # PageBlockSize ; 1st page block, for original page numbers and phys. addrs PageBlock2 # PageBlockSize ; 2nd page block, for new page numbers and phys. addrs DoTheGrowStackSize * :INDEX: @ -DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize +; Offset1To2 is only used by the first half of the routine. Reuse the space as flags for the second half: + ^ :INDEX: Offset1To2, sp +NeedToMoveFlag # 1 ; Whether we still need to move the current page + # 3 ; (spare) + +DoTheGrow ROUT + Entry "r3,r5,r10-r12", DoTheGrowStackSize ; First fill in the page block with -1 in the physical page number words STR r2, NumEntries ; save number of entries for use later STR r7, TotalAmount ; save amount growing by + FastCDA_ProfStart DoTheGrowInit, r0, r1, lr ADR r1, PageBlock1 ; point at 1st page block on stack ADD lr, r2, r2, LSL #1 ; lr = number of words in page block ADD lr, r1, lr, LSL #2 ; lr -> off end of page block @@ -3633,6 +3780,7 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize STR r0, [lr, #PageBlockSize] ; and put -1 in 2nd page block as well TEQ lr, r1 ; until the end BNE %BT10 + FastCDA_ProfEnd DoTheGrowInit, r0, r3, lr ; Now call the pre-grow handler @@ -3675,6 +3823,23 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize MOVS r4, r3 ; amount to do BEQ %FT20 ; [none, so skip all this] + + [ FastCDA_UpFront + ; Perform sledgehammer logic upfront + Push "r0,r2,r6" + MOV r0, r1 + MOV r2, r3 + BL CheckCacheabilityR0ByMinusR2 
+ TST r6, #DynAreaFlags_NotCacheable + LDR r0, =ZeroPage + ADR lr, %FT14 + ARMop MMU_Changing, EQ, tailcall, r0 + ARMop MMU_ChangingUncached, NE, tailcall, r0 +14 + Pull "r0,r2,r6" + ORR r6, r6, #PageFlags_Unsafe + ] + Push "r0, r1" SUB r0, r1, r3 ; src starts at start of 1st copy = start of 2nd - old size SUB r1, r0, r2 ; dst start = src start - amount of room needed @@ -3690,6 +3855,21 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize ADD r9, r3, r2 ; set up offset from 1st copy to 2nd copy (= new size) 25 ADD r1, r1, r3 ; r1 -> address of 1st extra page + [ FastCDA_UpFront + ; Flush src region from cache + Push "r0-r1,r6" + BL CheckCacheabilityR0ByMinusR2 + TST r6, #DynAreaFlags_NotCacheable + LDR r4, =ZeroPage + MOV r1, r2, LSR #12 + ADR lr, %FT29 + ARMop MMU_ChangingEntries, EQ, tailcall, r4 + ARMop MMU_ChangingUncachedEntries, NE, tailcall, r4 +29 + Pull "r0-r1,r6" + ; Now BangCam for all the pages + ORR r6, r6, #PageFlags_Unsafe + ] MOV r4, #0 ; amount done so far MOV r10, r2 ; move amount to do into r10, as routine returns page number in r2 ADR r3, PageBlock1 ; point at 1st entry we have to update @@ -3718,7 +3898,7 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize CLRV EXIT -37 +DoTheGrowPageUnavailable ROUT ; Come here if a required page is not available ; First we need to go back thru all the part of the page block we've already done, @@ -3751,11 +3931,12 @@ DoTheGrow Entry "r3,r5,r10-r12", DoTheGrowStackSize MakeErrorBlock CantGetPhysMem -DoTheGrowPagesSpecified +DoTheGrowPagesSpecified ROUT ; First check if any of the pages requested are unavailable ; At the same time as we're doing this, we fill in the log. and phys. 
addresses in the block + FastCDA_ProfStart MarkRequired, r0, r6, lr LDR r0, =ZeroPage LDR r0, [r0, #CamEntriesPointer] LDR r6, =L2PT @@ -3767,7 +3948,7 @@ DoTheGrowPagesSpecified STR r8, [r1, #PageBlockSize+4-12] ; and in 2nd page block TST lr, #PageFlags_Unavailable :OR: PageFlags_Required ; if page in use by someone else, or by us, then return error - BNE %BT37 + BNE DoTheGrowPageUnavailable ORR lr, lr, #PageFlags_Required ; set bit in flags to say page will be needed STR lr, [r4, #4] ; and store back @@ -3785,13 +3966,16 @@ DoTheGrowPagesSpecified SUBS r2, r2, #1 BNE %BT40 + FastCDA_ProfEnd MarkRequired, r0, r6, lr ; now issue Service_PagesUnsafe + FastCDA_ProfStart PagesUnsafe, r0, r6, lr ADR r2, PageBlock1 ; r2 -> 1st page block LDR r3, NumEntries ; r3 = number of entries in page block MOV r1, #Service_PagesUnsafe BL Issue_Service + FastCDA_ProfEnd PagesUnsafe, r0, r6, lr ; now move the pages @@ -3824,7 +4008,44 @@ DoTheGrowPagesSpecified MOVS r4, r3 ; amount to do BEQ %FT50 ; [none, so skip all this] + + [ FastCDA_UpFront + ; Perform sledgehammer logic upfront + Push "r0,r2,r6" + FastCDA_ProfStart DoublyCheckCacheability, r0, r6, lr + MOV r0, r1 + MOV r2, r3 + BL CheckCacheabilityR0ByMinusR2 + FastCDA_ProfEnd DoublyCheckCacheability, r0, r2, lr + TST r6, #DynAreaFlags_NotCacheable + LDR r2, =ZeroPage + [ FastCDA_Prof + LDREQ lr,[r2,#FastCDA_Prof_MMUChanging] + LDRNE lr,[r2,#FastCDA_Prof_MMUChangingUncached] + MRC p15,0,r0,c9,c13,0 + SUB lr,lr,r0 + STREQ lr,[r2,#FastCDA_Prof_MMUChanging] + STRNE lr,[r2,#FastCDA_Prof_MMUChangingUncached] + ] + ADR lr, %FT44 + ARMop MMU_Changing, EQ, tailcall, r2 + ARMop MMU_ChangingUncached, NE, tailcall, r2 +44 + [ FastCDA_Prof + MRC p15,0,r0,c9,c13,0 + TST r6, #DynAreaFlags_NotCacheable + LDREQ lr,[r2,#FastCDA_Prof_MMUChanging] + LDRNE lr,[r2,#FastCDA_Prof_MMUChangingUncached] + ADD lr,lr,r0 + STREQ lr,[r2,#FastCDA_Prof_MMUChanging] + STRNE lr,[r2,#FastCDA_Prof_MMUChangingUncached] + ] + Pull "r0,r2,r6" + ORR r6, r6, 
#PageFlags_Unsafe + ] + Push "r0, r1" + FastCDA_ProfStart DoublyMovePages, r0, r9, lr SUB r0, r1, r3 ; src starts at start of 1st copy = start of 2nd - old size SUB r1, r0, r2 ; dst start = src start - amount of room needed MOV r9, #0 ; no funny business while moving these pages @@ -3834,6 +4055,7 @@ DoTheGrowPagesSpecified ADD r1, r1, r5 ; advance dst ptr SUBS r4, r4, r5 ; one less page to move BNE %BT45 ; loop if more + FastCDA_ProfEnd DoublyMovePages, r0, r9, lr Pull "r0, r1" ; restore original regs 50 ADD r9, r3, r2 ; set up offset from 1st copy to 2nd copy (= new size) @@ -3851,6 +4073,7 @@ DoTheGrowPagesSpecified ; Now before we start, we must construct the second page block, with replacement page numbers ; DLINE "Start of 1st loop" + FastCDA_ProfStart FindSpare, r6, r1, lr 60 LDR r6, [r8], #12 ; r6 = page number required @@ -3907,11 +4130,21 @@ DoTheGrowPagesSpecified SUBS r9, r9, #1 ; one less entry to do BNE %BT60 + FastCDA_ProfEnd FindSpare, r7, r1, lr MOV r7, r3 ; r7 -> camentries ; Now we can go onto the 2nd loop which actually moves the pages + [ FastCDA_FIQs + ; Claim FIQs for this entire loop + ; (With the old behaviour, for large grows, total time in ReleaseFIQ could be several centiseconds, since the kernel reinstalls the default handler each time) + FastCDA_ProfStart ClaimFIQ, r6, r1, lr + MOV r1, #Service_ClaimFIQ + BL Issue_Service + FastCDA_ProfEnd ClaimFIQ, r6, r1, lr + ] + LDR r1, DestAddr MOV r4, #0 ; amount done MOV r0, r7 ; point r0 at camentries @@ -3922,9 +4155,22 @@ DoTheGrowPagesSpecified MRS r14, CPSR STR r14, SavedPSR ; save old PSR (note: stack must be flat when we do this!) 
+ [ FastCDA_NoPhysical + ; Grab the flags for the page we're replacing; NoPhysical optimisation means the page may get mapped to its target pos earlier than before, causing the flags in the CAM map to be "wrong" when we read them back out later on + LDR r11, [r8, #0] ; need to get PPL for page being replaced + ADD lr, r0, #4 ; point at PPLs, not addresses + LDR r11, [lr, r11, LSL #3] + MOV lr, #1 + STRB lr, NeedToMoveFlag + ] + Push "r0-r4,r7-r12" ; save regs used during copy + [ :LNOT: FastCDA_FIQs + FastCDA_ProfStart ClaimFIQ, r6, r1, lr MOV r1, #Service_ClaimFIQ BL Issue_Service + FastCDA_ProfEnd ClaimFIQ, r6, r1, lr + ] WritePSRc I_bit+SVC_mode, r6 ; disable IRQs round here (we don't want interrupt code to update ; the old mapping behind us while we're trying to copy it) @@ -3941,24 +4187,50 @@ DoTheGrowPagesSpecified ; The old scheme, always copying from other mapping, had interrupt cache coherency hole, at least for ; ARM with writeback cache (bug in 3.7, fixed in Ursula, then lost) + LDR r6, [r8, #4] ;logical address of src page + + [ FastCDA_CorruptFreePool + ; If the source is in the free pool, we don't need to preserve its contents + LDR r3, =ZeroPage+FreePoolDANode + SUB r2, r6, #FreePoolAddress + LDR r3, [r3, #DANode_MaxSize] + CMP r2, r3 + BLO %FT74 + ] + LDR r0, [r0, lr, LSL #3] ; r0 = log. 
address for replacement page (NB use logical address to write to, for cache consistency) - LDR r6, [r8, #4] ;logical address of src page LDR r3, =Nowhere TEQ r6, r3 ;will be 'Nowhere' if not mapped in BNE %FT71 + [ FastCDA_NoPhysical + ; No need to use AccessPhysicalAddress here - if the required page isn't mapped in, all we need to do is map it in at the target address and use that mapping to copy the data out + LDR r2, [r8, #0] + LDR r3, [sp, #4] ; Get stacked r1 + LDR r11, [sp, #11*4+:INDEX:DestFlags] + FastCDA_ProfStart MoveNeeded, lr, r4, r7 + BL Call_CAM_Mapping ; move needed page to destination + FastCDA_ProfEnd MoveNeeded, lr, r4, r7 + MOV r6, r3 ; r6 = logical address of src for copy + MOV lr, #0 + STRB lr, [sp, #11*4+:INDEX:NeedToMoveFlag] + | ASSERT HAL SUB sp, sp, #4 ; for oldp Push "r0,r1" + FastCDA_ProfStart AccessPhysical, r0, r1, lr MOV r0, #0 LDR r1, [r8, #8] ; r1 = physical address of src for copy ADD r2, sp, #8 ; must use physical address, as page may be mapped to nowhere along with others BL RISCOS_AccessPhysicalAddress MOV r6, r0 ; r6 = logical address of src for copy + FastCDA_ProfEnd AccessPhysical, r0, r1, lr Pull "r0,r1" + ] 71 + FastCDA_ProfStart CopyPage, r2, r3, r4 ADD lr, r6, r5 ; lr = end src address 72 LDMIA r6!, {r2, r3, r4, r7, r9, r10, r11, r12} @@ -3966,12 +4238,24 @@ DoTheGrowPagesSpecified TEQ r6, lr BNE %BT72 + FastCDA_ProfEnd CopyPage, r2, r3, r4 + + [ :LNOT: FastCDA_NoPhysical LDR r0, [r8, #4] ;logical address of src page LDR r3, =Nowhere TEQ r0, r3 Pull "r0", EQ ; oldp + [ FastCDA_Prof + BNE %FT73 + FastCDA_ProfStart ReleasePhysical, r2, r3, r4 + ] BLEQ RISCOS_ReleasePhysicalAddress + [ FastCDA_Prof + FastCDA_ProfEnd ReleasePhysical, r2, r3, r4 +73 + ] + ] ; now check if page we're replacing is in L2PT, and if so then adjust L1PT entries (4 of these) @@ -4010,38 +4294,71 @@ DoTheGrowPagesSpecified ; to avoid serious grief in the awkward cases. 
Fortunately, these page substitutions are relatively ; rare, so performance is not critical. + [ :LNOT: FastCDA_NoPhysical LDR lr, =ZeroPage LDR lr, [lr, #CamEntriesPointer] ; lr -> soft cam map ADD lr, lr, #4 ; point at PPLs, not addresses LDR r2, [r8, #0] ; need to get PPL for page being replaced LDR r11, [lr, r2, LSL #3] + ] BIC r11, r11, #PageFlags_Required ; knock off bits that indicate that it was a required page ADD lr, r8, #PageBlockSize LDMIA lr, {r2, r3} ; get page number, logical address + LDR lr, =Nowhere ; There's no point in cleaning the nowhere page, and on some architectures it'll even trigger an abort handler due to the lack of mapping + TEQ r3, lr + BEQ %FT75 + [ FastCDA_Unnecessary + ; We only need to clean the cache/TLB if the page is cacheable + ; For uncacheable pages, BangL2PT will flush the TLB for us just before the mapping is updated + TST r11, #DynAreaFlags_NotCacheable + BNE %FT75 + ] Push "r0, r4" - LDR r4, =Nowhere ; There's no point in cleaning the nowhere page, and on some architectures it'll even trigger an abort handler due to the lack of mapping MOV r0, r3 - TEQ r3, r4 - LDRNE r4, =ZeroPage - ARMop MMU_ChangingEntry,NE,,r4 + [ FastCDA_Prof + FastCDA_ProfStart ChangingEntry, r6, lr, r4 + ] + LDR r4, =ZeroPage + ARMop MMU_ChangingEntry,,,r4 + [ FastCDA_Prof + FastCDA_ProfEnd ChangingEntry, r6, lr, r4 + ] Pull "r0, r4" +75 + FastCDA_ProfStart MoveReplacement, r6, lr, r5 BL Call_CAM_Mapping ; move replacement page in + FastCDA_ProfEnd MoveReplacement, r6, lr, r5 76 LDR r2, [r8, #0] MOV r3, r1 LDR r11, DestFlags + [ FastCDA_NoPhysical + LDRB lr, NeedToMoveFlag + TEQ lr, #0 + BEQ %FT77 ; don't bother if page already been moved to dest + ] + FastCDA_ProfStart MoveNeeded, r6, lr, r5 BL Call_CAM_Mapping ; move needed page to destination + FastCDA_ProfEnd MoveNeeded, r6, lr, r5 +77 LDR lr, SavedPSR MSR CPSR_cf, lr + [ :LNOT: FastCDA_FIQs Push "r1" + FastCDA_ProfStart ReleaseFIQ, r1, lr, r5 MOV r1, #Service_ReleaseFIQ BL Issue_Service + 
FastCDA_ProfEnd ReleaseFIQ, r1, lr, r5 Pull "r1" + ] + [ FastCDA_Prof + MOV r5, #4096 + ] ADD r1, r1, r5 ; advance dest ptr ADD r4, r4, r5 ; increment amount done @@ -4049,17 +4366,26 @@ DoTheGrowPagesSpecified CMP r4, r7 ; have we done all? BNE %BT70 ; [no, so loop] + [ FastCDA_FIQs + FastCDA_ProfStart ReleaseFIQ, r1, lr, r2 + MOV r1, #Service_ReleaseFIQ + BL Issue_Service + FastCDA_ProfEnd ReleaseFIQ, r1, lr, r2 + ] + LDR r3, [r12, #DANode_Size] ADD r3, r3, r7 STR r3, [r12, #DANode_Size] ; store increased destination size ; now issue Service_PagesSafe + FastCDA_ProfStart PagesSafe, r1, r2, r3 LDR r2, NumEntries ADR r3, PageBlock1 ADR r4, PageBlock2 MOV r1, #Service_PagesSafe BL Issue_Service + FastCDA_ProfEnd PagesSafe, r1, r2, r3 ; now call Post_Grow handler @@ -4090,7 +4416,8 @@ DoTheGrowPagesSpecified ; Note: Removal is from one area only, the calling routine breaks the chunk at free/app boundary. -DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize +DoTheGrowNotSpecified ROUT + Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize STR r2, NumEntries ; save number of entries for use later STR r7, TotalAmount ; save amount growing by @@ -4124,6 +4451,23 @@ DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize MOVS r4, r3 ; amount to do BEQ %FT20 ; [none, so skip all this] + + [ FastCDA_UpFront + ; Perform sledgehammer logic upfront + Push "r0,r2,r6" + MOV r0, r1 + MOV r2, r3 + BL CheckCacheabilityR0ByMinusR2 + TST r6, #DynAreaFlags_NotCacheable + LDR r0, =ZeroPage + ADR lr, %FT14 + ARMop MMU_Changing, EQ, tailcall, r0 + ARMop MMU_ChangingUncached, NE, tailcall, r0 +14 + Pull "r0,r2,r6" + ORR r6, r6, #PageFlags_Unsafe + ] + Push "r0, r1" SUB r0, r1, r3 ; src starts at start of 1st copy = start of 2nd - old size SUB r1, r0, r2 ; dst start = src start - amount of room needed @@ -4139,6 +4483,21 @@ DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize ADD r9, r3, r2 ; set up offset from 1st copy to 
2nd copy (= new size) 25 ADD r1, r1, r3 ; r1 -> address of 1st extra page + [ FastCDA_UpFront + ; Flush src region from cache + Push "r0-r1,r6" + BL CheckCacheabilityR0ByMinusR2 + TST r6, #DynAreaFlags_NotCacheable + LDR r4, =ZeroPage + MOV r1, r2, LSR #12 + ADR lr, %FT29 + ARMop MMU_ChangingEntries, EQ, tailcall, r4 + ARMop MMU_ChangingUncachedEntries, NE, tailcall, r4 +29 + Pull "r0-r1,r6" + ; Now BangCam for all the pages + ORR r6, r6, #PageFlags_Unsafe + ] MOV r4, #0 ; amount done so far MOV r10, r2 ; move amount to do into r10 30 @@ -4182,7 +4541,8 @@ DoTheGrowNotSpecified Entry "r3,r5,r10-r12", DoTheGrowNotSpecifiedStackSize ; endif ; -CheckAppSpace Entry "r0-r3" +CheckAppSpace ROUT + Entry "r0-r3" LDR r2, =ZeroPage LDR r3, [r2, #AplWorkSize] LDR r2, [r2, #Curr_Active_Object] @@ -4336,12 +4696,14 @@ CallPreGrow Entry "r0,r4, r12" CMP r0, #0 ; if none (V=0) EXIT EQ ; then exit + FastCDA_ProfStart CallPreGrow, r0, r4, lr MOV r0, #DAHandler_PreGrow ; r0 = reason code LDR r4, [r12, #DANode_Size] ; r4 = current size ASSERT DANode_Handler = DANode_Workspace +4 ADD r12, r12, #DANode_Workspace MOV lr, pc LDMIA r12, {r12, pc} ; load workspace pointer and jump to handler + FastCDA_ProfEnd CallPreGrow, r12, r4, lr EXIT VC ; if no error then exit TEQ r0, #0 ; if generic error returned @@ -4371,12 +4733,14 @@ CallPostGrow Entry "r0,r3,r4, r12" CMP r0, #0 ; if none (V=0) EXIT EQ ; then exit + FastCDA_ProfStart CallPostGrow, r0, r4, lr MOV r0, #DAHandler_PostGrow ; r0 = reason code LDR r4, [r12, #DANode_Size] ; r4 = new size ASSERT DANode_Handler = DANode_Workspace +4 ADD r12, r12, #DANode_Workspace MOV lr, pc LDMIA r12, {r12, pc} ; load workspace pointer and jump to handler + FastCDA_ProfEnd CallPostGrow, r12, r4, lr EXIT [ ShrinkableDAs diff --git a/s/VMSAv6 b/s/VMSAv6 index bcff6c6..7d65228 100644 --- a/s/VMSAv6 +++ b/s/VMSAv6 @@ -75,7 +75,8 @@ BangCamUpdate ROUT LDR r1, [r1, #CamEntriesPointer] ADD r1, r1, r2, LSL #3 ; point at cam entry (logaddr, PPL) LDMIA r1, {r0, 
r6} ; r0 = current logaddress, r6 = current PPL - STMIA r1, {r3, r11} ; store new address, PPL + BIC r4, r11, #PageFlags_Unsafe + STMIA r1, {r3, r4} ; store new address, PPL Push "r0, r6" ; save old logical address, PPL LDR r1, =ZeroPage+PhysRamTable ; go through phys RAM table MOV r6, r2 ; make copy of r2 (since that must be preserved) @@ -105,11 +106,15 @@ BangCamUpdate ROUT TEQ r4, r0, LSR #12 ; if equal to physical address of page being moved BNE %FT20 ; if not there, then just put in new page + AND r4, r11, #PageFlags_Unsafe Push "r0, r3, r11, r14" ; save phys.addr, new log.addr, new PPL, lr ADD r3, sp, #4*4 LDMIA r3, {r3, r11} ; reload old logical address, old PPL + LDR r0, =DuffEntry ; Nothing to do if wasn't mapped in + ORR r11, r11, r4 + TEQ r3, r0 MOV r0, #0 ; cause translation fault - BL BangL2PT ; map page out + BLNE BangL2PT ; map page out Pull "r0, r3, r11, r14" 20 ADD sp, sp, #8 ; junk old logical address, PPL @@ -197,6 +202,9 @@ BangL2PT ; internal entry point used only Push "lr" MOV r6, r0 + TST r11, #PageFlags_Unsafe + BNE %FT30 + TST r11, #DynAreaFlags_DoublyMapped BNE BangL2PT_sledgehammer ;if doubly mapped, don't try to be clever -- GitLab