; Copyright 1997 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
;;; memcpyset.s: fast implementation of memcpy, memset, memmove
;;; Copyright (C) Advanced RISC Machines Ltd., 1991

; Overview
; --------
; memmove/memcpy share one body.  It copies upwards (ascending addresses)
; when src >= dst and downwards (descending addresses) when src < dst, so
; overlapping regions are handled in either direction.
; memset fills a region with one byte value.
;
; Conventions used throughout this file:
;   * n never holds the plain byte count inside the routines; it holds
;     "bytes remaining minus a bias" so that a single SUBS/ADDS both
;     updates the count and sets the flags for the next branch.  The
;     comments "(n+4) to go", "(n+32) to go" etc. state the bias that is
;     current at that point.
;   * 0, 1, 2 are local numeric labels; %B0 refers backwards to the
;     nearest 0, %F1 forwards to the nearest 1.
;   * Function, FunctionEntry and Return are macros from objmacs.s (not
;     visible here).  NOTE(review): from the existing comments,
;     FunctionEntry , "dst" appears to stack dst together with the return
;     address, and Return , "a1" appears to unstack them again; the
;     LinkNotStacked form appears to return directly through lr - confirm
;     against objmacs.s.

        GET     objmacs.s

        CodeArea

; $SLA / $SHA are the shift operators used to merge two source words when
; src and dst are not co-aligned.  They are selected by endianness below.
        GBLS    SLA     ; shift towards low address end
        GBLS    SHA     ; shift towards high address end
 [ {ENDIAN} = "big"
SLA     SETS    "LSL"
SHA     SETS    "LSR"
 |                      ; assume little-endian
SLA     SETS    "LSR"
SHA     SETS    "LSL"
 ]

 [ make = "memcpy" :LOR: make = "all" :LOR: make = "shared-library"

; Register roles for memmove/memcpy.
src     RN      a2      ; source pointer
dst     RN      a1      ; destination pointer (also the return value)
n       RN      a3      ; biased byte count, see the overview above
tmp1    RN      a4      ; scratch
tmp3    RN      ip      ; scratch / alignment in bytes

; memmove and memcpy are two entry names for the same code.
; In:  dst (a1), src (a2), n (a3) = number of bytes.
; Out: a1 = the original dst.
        Function memmove, leaf
        Function memcpy, leaf

        CMP     src, dst
        BLO     CopyDown                ; src below dst: copy from the top down
        Return  , "", LinkNotStacked, EQ  ; dst == src, no move required

;======================================================================
; Copy up code (src > dst): ascending addresses.
        FunctionEntry , "dst"           ; Must return original dst.
        SUBS    n, n, #4                ; need at least 4 bytes
        BLT     Up_TrailingBytes        ; < 4 bytes to go

        ; word align the dst - first find out how many bytes
        ; must be stored to do this.  If the number is 0
        ; check the src too.

        ANDS    tmp3, dst, #3           ; eq means aligned!
        BNE     Up_AlignDst
        ANDS    tmp3, src, #3
        BNE     Up_SrcUnaligned         ; more difficult!

        ; here when source and destination are both aligned.
        ; number of bytes to transfer is (n+4), n is >= 0.

Up_SrcDstAligned
        SUBS    n, n, #12-4             ; 12 bytes or more?
        BLT     Up_TrailingWords
        ; Only tmp1, tmp3 and lr are free at this point.  One more
        ; register (v1) is stacked so that four words can be moved per
        ; LDM/STM, but only when at least 32 bytes remain.
        ; (The original note here spoke of stacking 8 registers and
        ; comparing with 64; the code as written stacks v1 alone and
        ; branches away when fewer than 32 bytes remain.)

        SUBS    n, n, #32-12            ; n+32 to go.
        BLT     %F1                     ; fewer than 32 bytes: skip the big loop

        STMFD   sp!, {v1}

        ; loop loading 4 registers per time, twice (32 bytes)

0       LDMIA   src!, {tmp1, v1, tmp3, lr}
        STMIA   dst!, {tmp1, v1, tmp3, lr}
        LDMIA   src!, {tmp1, v1, tmp3, lr}
        STMIA   dst!, {tmp1, v1, tmp3, lr}
        SUBS    n, n, #32
        BGE     %B0
        ; see if we can handle another 16 bytes (one more 4-word transfer)
        CMN     n, #16                  ; GE when (n+32) >= 16
        LDMGEIA src!, {tmp1, v1, tmp3, lr}
        STMGEIA dst!, {tmp1, v1, tmp3, lr}
        SUBGE   n, n, #16

        ; Reload the registers - note that we still have (n+32)
        ; bytes to go, and that this is <16.

        LDMFD   sp!, {v1}

        ; Here when there are fewer than 16 bytes to go
        ; (or fewer than 32 when entered by the branch to 1).

1       ADDS    n, n, #32-12            ; (n-12) to go

        ; Ok - do three words at a time.  The flags from the ADDS above
        ; (first pass) or from SUBGES (later passes) gate each transfer.

2       LDMGEIA src!, {tmp1, tmp3, lr}
        STMGEIA dst!, {tmp1, tmp3, lr}
        SUBGES  n, n, #12
        BGE     %B2
        ; (n-12) bytes to go - 0, 1 or 2 words.  Check
        ; which.

Up_TrailingWords
        ADDS    n, n, #12-4             ; (n-4) to go
        BLT     Up_TrailingBytes        ; < 4 bytes to go
        SUBS    n, n, #4
        LDRLT   tmp1, [src], #4         ; LT: exactly one whole word left
        STRLT   tmp1, [dst], #4
        LDMGEIA src!, {tmp1, tmp3}      ; GE: two whole words left
        STMGEIA dst!, {tmp1, tmp3}
        SUBGE   n, n, #4

        ; Here with less than 4 bytes to go

Up_TrailingBytes
        ADDS    n, n, #4                ; remove the bias: n = bytes left
        Return  , "a1", , EQ            ; 0 bytes
        CMP     n, #2                   ; 1, 2 or 3 bytes
        LDRB    tmp1, [src], #1         ; always at least one byte
        STRB    tmp1, [dst], #1
        LDRGEB  tmp1, [src], #1         ; second byte when n >= 2
        STRGEB  tmp1, [dst], #1
        LDRGTB  tmp1, [src], #1         ; third byte when n == 3
        STRGTB  tmp1, [dst], #1
        Return  , "a1"                  ; recover old dst value

;------------------------------------------------------------

; word align dst - tmp3 contains current destination
; alignment.  We can store at least 4 bytes here.

Up_AlignDst
        RSB     tmp3, tmp3, #4          ; 1-3 bytes to go
        CMP     tmp3, #2
        LDRB    tmp1, [src], #1
        STRB    tmp1, [dst], #1
        LDRGEB  tmp1, [src], #1
        STRGEB  tmp1, [dst], #1
        LDRGTB  tmp1, [src], #1
        STRGTB  tmp1, [dst], #1
        SUBS    n, n, tmp3              ; check number to go
        BLT     Up_TrailingBytes        ; less than 4 bytes
        ANDS    tmp3, src, #3
        BEQ     Up_SrcDstAligned        ; coaligned case

        ; The source is not coaligned with the destination,
        ; the destination IS currently word aligned.
        ; tmp3 = src AND 3 on entry (1, 2 or 3).

Up_SrcUnaligned
        BIC     src, src, #3            ; tmp3 holds extra!
        LDR     lr, [src], #4           ; 1-3 useful bytes, carried in lr
        CMP     tmp3, #2
        BGT     Up_OneByte              ; one byte in lr
        BEQ     Up_TwoBytes             ; two bytes in lr

; The next three source bytes are in lr, one byte must
; come from the next source word.  At least four bytes
; more must be stored.  Check first to see if there are a
; sufficient number of bytes to go to justify using stm/ldm
; instructions.
; (The trailing numbers on the lines below are the original author's
; cycle estimates.)

Up_ThreeBytes
        CMP     n, #16-4                ; at least 16 bytes?
        BLT     %F1                     ; no                    ; 1
        SUB     n, n, #16-4             ; (n+16) bytes to go    ; 1

        ; save some work registers.  The point at which this
        ; is done is based on the ldm/stm time being = (n+3)+(n/4)S

        STMFD   sp!, {v1, v2}           ; 14   ????

        ; loop doing 16 bytes at a time.  There are currently
        ; three useful bytes in lr.

0       MOV     tmp1, lr, $SLA #8       ; first three bytes     ; 1
        LDMIA   src!, {v1, v2, tmp3, lr}                        ; 12/13
        ORR     tmp1, tmp1, v1, $SHA #24         ; word 1       ; 1
        MOV     v1, v1, $SLA #8                  ; ...
        ORR     v1, v1, v2, $SHA #24             ; word 2       ; 2 (1+1)
        MOV     v2, v2, $SLA #8
        ORR     v2, v2, tmp3, $SHA #24           ; word 3       ; 2
        MOV     tmp3, tmp3, $SLA #8
        ORR     tmp3, tmp3, lr, $SHA #24         ; word 4       ; 2
        STMIA   dst!, {tmp1, v1, v2, tmp3}                      ; 12/13
        SUBS    n, n, #16                                       ; 1
        BGE     %B0                                             ; 4 / 1

        ; Original timing notes, kept as written.
        ; NOTE(review): they speak of 32n bytes although the loop above
        ; moves 16 bytes per pass - figures not verified.
        ; loop timing (depends on alignment) for n loops:-
        ;       pre:    17
        ;               ((45/46/47)n - 3) for 32n bytes
        ;       post:   13/14
        ;       total:  (45/46/47)n+(27/28)
        ;       32 bytes:       72-75
        ;       64 bytes:       117-122
        ;       96 bytes:       162-169

        ; Reload registers

        LDMFD   sp!, {v1, v2}           ; 12/13 ????

        ADDS    n, n, #16-4             ; check for at least 4
        BLT     %F2                     ; < 4 bytes

        ; word-at-a-time loop: merge the three carried bytes with one
        ; byte of the next source word.

1       MOV     tmp3, lr, $SLA #8       ; first three bytes     ; 1
        LDR     lr, [src], #4           ; next four bytes       ; 4
        ORR     tmp3, tmp3, lr, $SHA #24                        ; 1
        STR     tmp3, [dst], #4                                 ; 4
        SUBS    n, n, #4                                        ; 1
        BGE     %B1                     ; lr contains three bytes ; 1 / 4

        ; Loop timing (original notes):
        ;       15n-3   for 4n bytes
        ;       32:     117
        ;       64:     237

        ; Less than four bytes to go - readjust the src
        ; address so that it points at the three bytes still unused in lr.

2       SUB     src, src, #3
        B       Up_TrailingBytes

; The next two source bytes are in lr, two bytes must
; come from the next source word.  At least four bytes
; more must be stored.

Up_TwoBytes
        CMP     n, #16-4                ; at least 16 bytes?
        BLT     %F1                     ; no
        SUB     n, n, #16-4             ; (n+16) bytes to go

        ; save the extra work registers

        STMFD   sp!, {v1, v2}

        ; loop doing 16 bytes at a time.  There are currently
        ; two useful bytes in lr.

0       MOV     tmp1, lr, $SLA #16      ; first two bytes
        LDMIA   src!, {v1, v2, tmp3, lr}
        ORR     tmp1, tmp1, v1, $SHA #16         ; word 1
        MOV     v1, v1, $SLA #16
        ORR     v1, v1, v2, $SHA #16             ; word 2
        MOV     v2, v2, $SLA #16
        ORR     v2, v2, tmp3, $SHA #16           ; word 3
        MOV     tmp3, tmp3, $SLA #16
        ORR     tmp3, tmp3, lr, $SHA #16         ; word 4
        STMIA   dst!, {tmp1, v1, v2, tmp3}
        SUBS    n, n, #16
        BGE     %B0

        ; Reload registers

        LDMFD   sp!, {v1, v2}

        ADDS    n, n, #16-4             ; check number of bytes
        BLT     %F2

1       MOV     tmp3, lr, $SLA #16      ; first two bytes
        LDR     lr, [src], #4           ; next four bytes
        ORR     tmp3, tmp3, lr, $SHA #16
        STR     tmp3, [dst], #4
        SUBS    n, n, #4
        BGE     %B1                     ; lr contains two bytes

        ; Less than four bytes to go - readjust the src
        ; address so that it points at the two bytes still unused in lr.

2       SUB     src, src, #2
        B       Up_TrailingBytes

; The next source byte is in lr, three bytes must
; come from the next source word.  At least four bytes
; more must be stored.

Up_OneByte
        CMP     n, #16-4                ; at least 16 bytes?
        BLT     %F1                     ; no
        SUB     n, n, #16-4             ; (n+16) bytes to go

        ; save the extra work registers

        STMFD   sp!, {v1, v2}

        ; loop doing 16 bytes at a time.  There is currently
        ; one useful byte in lr

0       MOV     tmp1, lr, $SLA #24      ; first byte
        LDMIA   src!, {v1, v2, tmp3, lr}
        ORR     tmp1, tmp1, v1, $SHA #8          ; word 1
        MOV     v1, v1, $SLA #24
        ORR     v1, v1, v2, $SHA #8              ; word 2
        MOV     v2, v2, $SLA #24
        ORR     v2, v2, tmp3, $SHA #8            ; word 3
        MOV     tmp3, tmp3, $SLA #24
        ORR     tmp3, tmp3, lr, $SHA #8          ; word 4
        STMIA   dst!, {tmp1, v1, v2, tmp3}
        SUBS    n, n, #16
        BGE     %B0

        ; Reload registers

        LDMFD   sp!, {v1, v2}

        ADDS    n, n, #16-4             ; check number of bytes
        BLT     %F2

1       MOV     tmp3, lr, $SLA #24      ; first byte
        LDR     lr, [src], #4           ; next four bytes
        ORR     tmp3, tmp3, lr, $SHA #8
        STR     tmp3, [dst], #4
        SUBS    n, n, #4
        BGE     %B1                     ; lr contains one byte

        ; Less than four bytes to go - one byte is still unused in lr,
        ; so step src back to it.

2       SUB     src, src, #1
        B       Up_TrailingBytes

;======================================================================
; Copy down code
; ==============

;       This mirrors the copy up code but copies in the opposite
;       direction (descending addresses).  Differences visible below:
;       it is entered without FunctionEntry, so lr is not stacked on
;       entry; lr is therefore stacked wherever it is used as a
;       temporary, the carried source word lives in tmp1 rather than lr,
;       and the returns use the LinkNotStacked form.

CopyDown
        ADD     src, src, n             ; points beyond end
        ADD     dst, dst, n

        SUBS    n, n, #4                ; need at least 4 bytes
        BLT     Down_TrailingBytes      ; < 4 bytes to go

        ; word align the dst - first find out how many bytes
        ; must be stored to do this.  If the number is 0
        ; check the src too.

        ANDS    tmp3, dst, #3           ; eq means aligned!
        BNE     Down_AlignDst
        ANDS    tmp3, src, #3
        BNE     Down_SrcUnaligned       ; more difficult!

        ; here when source and destination are both aligned.
        ; number of bytes to transfer is (n+4), n is >= 0.

Down_SrcDstAligned
        SUBS    n, n, #12-4             ; 12 bytes or more?
        BLT     Down_TrailingWords
        ; Only tmp1 and tmp3 are free here; v1 and lr are stacked to
        ; make four-word (and three-word) transfers possible.  They are
        ; stacked BEFORE the test below, so both the loop and label 1
        ; run with v1 and lr saved.
        ; (The original note here spoke of stacking 8 registers and
        ; comparing with 64; the code as written stacks v1 and lr and
        ; skips the loop when fewer than 32 bytes remain.)

        STMFD   sp!, {v1, lr}
        SUBS    n, n, #32-12            ; n+32 to go.
        BLT     %F1

        ; loop loading 4 registers per time, twice (32 bytes)

0       LDMDB   src!, {tmp1, v1, tmp3, lr}
        STMDB   dst!, {tmp1, v1, tmp3, lr}
        LDMDB   src!, {tmp1, v1, tmp3, lr}
        STMDB   dst!, {tmp1, v1, tmp3, lr}
        SUBS    n, n, #32
        BGE     %B0
        ; see if we can handle another 16 bytes (one more 4-word transfer)
1       CMN     n, #16                  ; GE when (n+32) >= 16
        LDMGEDB src!, {tmp1, v1, tmp3, lr}
        STMGEDB dst!, {tmp1, v1, tmp3, lr}
        SUBGE   n, n, #16

        ; Here when there are fewer than 16 bytes to go.

        ADDS    n, n, #32-12            ; (n-12) to go

        ; Ok - do three words at a time.  No loop is needed: fewer than
        ; 16 bytes remain, so at most one 12-byte transfer fits.

        LDMGEDB src!, {tmp1, tmp3, lr}
        STMGEDB dst!, {tmp1, tmp3, lr}
        SUBGE   n, n, #12
        LDMFD   sp!, {v1, lr}           ; restore v1 and the return address
        ; (n-12) bytes to go - 0, 1 or 2 words.  Check
        ; which.

Down_TrailingWords
        ADDS    n, n, #12-4             ; (n-4) to go
        BLT     Down_TrailingBytes      ; < 4 bytes to go
        SUBS    n, n, #4
        LDRLT   tmp1, [src, #-4]!       ; LT: exactly one whole word left
        STRLT   tmp1, [dst, #-4]!
        LDMGEDB src!, {tmp1, tmp3}      ; GE: two whole words left
        STMGEDB dst!, {tmp1, tmp3}
        SUBGE   n, n, #4

        ; Here with less than 4 bytes to go

Down_TrailingBytes
        ADDS    n, n, #4                ; remove the bias: n = bytes left
        Return  , "", LinkNotStacked, EQ  ; 0 bytes
        CMP     n, #2                   ; 1, 2 or 3 bytes
        LDRB    tmp1, [src, #-1]!       ; always at least one byte
        STRB    tmp1, [dst, #-1]!
        LDRGEB  tmp1, [src, #-1]!       ; second byte when n >= 2
        STRGEB  tmp1, [dst, #-1]!
        LDRGTB  tmp1, [src, #-1]!       ; third byte when n == 3
        ; dst started at (original dst + n) and every store pre-decrements
        ; it, so after the last store dst is the original dst again.
        STRGTB  tmp1, [dst, #-1]!
        Return  , "", LinkNotStacked

;------------------------------------------------------------

; word align dst - tmp3 contains current destination
; alignment.  We can store at least 4 bytes here.  We are
; going downwards - so tmp3 is the actual number of bytes
; to store.

Down_AlignDst
        CMP     tmp3, #2
        LDRB    tmp1, [src, #-1]!
        STRB    tmp1, [dst, #-1]!
        LDRGEB  tmp1, [src, #-1]!
        STRGEB  tmp1, [dst, #-1]!
        LDRGTB  tmp1, [src, #-1]!
        STRGTB  tmp1, [dst, #-1]!
        SUBS    n, n, tmp3              ; check number to go
        BLT     Down_TrailingBytes      ; less than 4 bytes
        ANDS    tmp3, src, #3
        BEQ     Down_SrcDstAligned      ; coaligned case

        ; The source is not coaligned with the destination,
        ; the destination IS currently word aligned.
        ; tmp3 = src AND 3 on entry (1, 2 or 3).

Down_SrcUnaligned
        BIC     src, src, #3            ; tmp3 holds extra!
        LDR     tmp1, [src]             ; 1-3 useful bytes, carried in tmp1
        CMP     tmp3, #2
        BLT     Down_OneByte            ; one byte in tmp1
        BEQ     Down_TwoBytes           ; two bytes in tmp1

; The last three source bytes are in tmp1, one byte must
; come from the previous source word.  At least four bytes
; more must be stored.  Check first to see if there are a
; sufficient number of bytes to go to justify using stm/ldm
; instructions.

Down_ThreeBytes
        CMP     n, #16-4                ; at least 16 bytes?
        BLT     %F1                     ; no
        SUB     n, n, #16-4             ; (n+16) bytes to go

        ; save the extra work registers; lr is used as a temporary in
        ; the loop so it is saved too

        STMFD   sp!, {v1, v2, lr}

        ; loop doing 16 bytes at a time.  There are currently
        ; three useful bytes in tmp1 (a4).

0       MOV     lr, tmp1, $SHA #8       ; last three bytes
        LDMDB   src!, {tmp1, v1, v2, tmp3}
        ORR     lr, lr, tmp3, $SLA #24           ; word 4
        MOV     tmp3, tmp3, $SHA #8
        ORR     tmp3, tmp3, v2, $SLA #24         ; word 3
        MOV     v2, v2, $SHA #8
        ORR     v2, v2, v1, $SLA #24             ; word 2
        MOV     v1, v1, $SHA #8
        ORR     v1, v1, tmp1, $SLA #24           ; word 1
        STMDB   dst!, {v1, v2, tmp3, lr}
        SUBS    n, n, #16
        BGE     %B0

        ; Reload registers

        LDMFD   sp!, {v1, v2, lr}

        ADDS    n, n, #16-4             ; check for at least 4
        BLT     %F2                     ; < 4 bytes

1       MOV     tmp3, tmp1, $SHA #8     ; last three bytes
        LDR     tmp1, [src, #-4]!       ; previous four bytes
        ORR     tmp3, tmp3, tmp1, $SLA #24
        STR     tmp3, [dst, #-4]!
        SUBS    n, n, #4
        BGE     %B1                     ; tmp1 contains three bytes

        ; Less than four bytes to go - readjust the src
        ; address so that it points just past the three bytes still
        ; unused in tmp1.

2       ADD     src, src, #3
        B       Down_TrailingBytes

; The last two source bytes are in tmp1, two bytes must
; come from the previous source word.  At least four bytes
; more must be stored.

Down_TwoBytes
        CMP     n, #16-4                ; at least 16 bytes?
        BLT     %F1                     ; no
        SUB     n, n, #16-4             ; (n+16) bytes to go

        ; save the extra work registers; lr is used as a temporary in
        ; the loop so it is saved too

        STMFD   sp!, {v1, v2, lr}

        ; loop doing 16 bytes at a time.  There are currently
        ; two useful bytes in tmp1 (a4).

0       MOV     lr, tmp1, $SHA #16      ; last two bytes
        LDMDB   src!, {tmp1, v1, v2, tmp3}
        ORR     lr, lr, tmp3, $SLA #16           ; word 4
        MOV     tmp3, tmp3, $SHA #16
        ORR     tmp3, tmp3, v2, $SLA #16         ; word 3
        MOV     v2, v2, $SHA #16
        ORR     v2, v2, v1, $SLA #16             ; word 2
        MOV     v1, v1, $SHA #16
        ORR     v1, v1, tmp1, $SLA #16           ; word 1
        STMDB   dst!, {v1, v2, tmp3, lr}
        SUBS    n, n, #16
        BGE     %B0

        ; Reload registers

        LDMFD   sp!, {v1, v2, lr}

        ADDS    n, n, #16-4             ; check for at least 4
        BLT     %F2                     ; < 4 bytes

1       MOV     tmp3, tmp1, $SHA #16    ; last two bytes
        LDR     tmp1, [src, #-4]!       ; previous four bytes
        ORR     tmp3, tmp3, tmp1, $SLA #16
        STR     tmp3, [dst, #-4]!
        SUBS    n, n, #4
        BGE     %B1                     ; tmp1 contains two bytes

        ; Less than four bytes to go - readjust the src
        ; address so that it points just past the two bytes still
        ; unused in tmp1.

2       ADD     src, src, #2
        B       Down_TrailingBytes

; The last source byte is in tmp1, three bytes must
; come from the previous source word.  At least four bytes
; more must be stored.

Down_OneByte
        CMP     n, #16-4                ; at least 16 bytes?
        BLT     %F1                     ; no
        SUB     n, n, #16-4             ; (n+16) bytes to go

        ; save the extra work registers; lr is used as a temporary in
        ; the loop so it is saved too

        STMFD   sp!, {v1, v2, lr}

        ; loop doing 16 bytes at a time.  There is currently
        ; one useful byte in tmp1 (a4).

0       MOV     lr, tmp1, $SHA #24      ; last byte
        LDMDB   src!, {tmp1, v1, v2, tmp3}
        ORR     lr, lr, tmp3, $SLA #8            ; word 4
        MOV     tmp3, tmp3, $SHA #24
        ORR     tmp3, tmp3, v2, $SLA #8          ; word 3
        MOV     v2, v2, $SHA #24
        ORR     v2, v2, v1, $SLA #8              ; word 2
        MOV     v1, v1, $SHA #24
        ORR     v1, v1, tmp1, $SLA #8            ; word 1
        STMDB   dst!, {v1, v2, tmp3, lr}
        SUBS    n, n, #16
        BGE     %B0

        ; Reload registers

        LDMFD   sp!, {v1, v2, lr}

        ADDS    n, n, #16-4             ; check for at least 4
        BLT     %F2                     ; < 4 bytes

1       MOV     tmp3, tmp1, $SHA #24    ; last byte
        LDR     tmp1, [src, #-4]!       ; previous four bytes
        ORR     tmp3, tmp3, tmp1, $SLA #8
        STR     tmp3, [dst, #-4]!
        SUBS    n, n, #4
        BGE     %B1                     ; tmp1 contains one byte

        ; Less than four bytes to go - one byte is still unused in tmp1,
        ; so step src forward past it.

2       ADD     src, src, #1
        B       Down_TrailingBytes

;------------------------------------------------------------

 ]

 [ make = "memset" :LOR: make = "all" :LOR: make = "shared-library"

; Register roles for memset.  src (a2) carries the fill value c.
src     RN      a2      ; fill value, replicated to all four bytes below
dst     RN      a1      ; destination pointer (also the return value)
n       RN      a3      ; biased byte count, see the overview at the top
tmp1    RN      a4      ; copy of the fill word
tmp3    RN      ip      ; alignment in bytes, then copy of the fill word

; extern void *memset(void * /*s*/, int /*c*/, size_t /*n*/)
; Stores the low byte of c into n bytes starting at s.
; Out: a1 = the original s.

        Function memset, leaf

        FunctionEntry , "dst"           ; Must return original dst.
        SUBS    n, n, #4                ; need at least 4 bytes
        BMI     TrailingBytes           ; < 4 bytes to go

        ; word align the dst - first find out how many bytes
        ; must be stored to do this.

        ANDS    tmp3, dst, #3           ; eq means aligned!
        BNE     AlignDst

        ; here when destination is word-aligned,
        ; number of bytes to transfer is (n+4), n is >= 0.

DstAligned
        AND     src, src, #&ff          ; keep only the low byte of c
        ORR     src, src, src, ASL #8   ; replicate it into two bytes
        ORR     src, src, src, ASL #16  ; ... and into all four bytes
        MOV     tmp1, src               ; three more copies of the fill
        MOV     tmp3, src               ; word so that STM can store up
        MOV     lr, src                 ; to four words per instruction

        SUBS    n, n, #12-4             ; 12 bytes or more?
        BLT     TrailingWords

        SUBS    n, n, #32-12            ; n+32 to go.
        BLT     %F1

        ; loop storing 4 registers per time, twice (32 bytes)

0       STMIA   dst!, {src, tmp1, tmp3, lr}
        STMIA   dst!, {src, tmp1, tmp3, lr}
        SUBS    n, n, #32
        BGE     %B0
        ; see if we can handle another 16 bytes (one more 4-word store)
        CMN     n, #16                  ; GE when (n+32) >= 16
        STMGEIA dst!, {src, tmp1, tmp3, lr}
        SUBGE   n, n, #16

        ; note that we still have (n+32) bytes to go, and this is <16.

        ; Here when there are fewer than 16 bytes to go
        ; (or fewer than 32 when entered by the branch to 1).

1       ADDS    n, n, #32-12            ; (n-12) to go

        ; Ok - do three words at a time.

2       STMGEIA dst!, {tmp1, tmp3, lr}
        SUBGES  n, n, #12
        BGE     %B2
        ; (n-12) bytes to go - 0, 1 or 2 words.  Check
        ; which.

TrailingWords
        ADDS    n, n, #12-4             ; (n-4) to go
        BLT     TrailingBytes           ; < 4 bytes to go
        SUBS    n, n, #4
        STRLT   src, [dst], #4          ; LT: exactly one whole word left
        STMGEIA dst!, {src, tmp1}       ; GE: two whole words left
        SUBGE   n, n, #4

        ; Here with less than 4 bytes to go.  STRB stores only the low
        ; byte of src, so this also works when entered before src has
        ; been replicated.

TrailingBytes
        ADDS    n, n, #4                ; remove the bias: n = bytes left
        Return  , "a1",, EQ             ; 0 bytes
        CMP     n, #2                   ; 1, 2 or 3 bytes
        STRB    src, [dst], #1          ; always at least one byte
        STRGEB  src, [dst], #1          ; second byte when n >= 2
        STRGTB  src, [dst], #1          ; third byte when n == 3
        Return  , "a1"                  ; recover old dst value

;------------------------------------------------------------

; word align dst - tmp3 contains current destination
; alignment.  We can store at least 4 bytes here.

AlignDst
        RSB     tmp3, tmp3, #4          ; 1-3 bytes to go
        CMP     tmp3, #2
        STRB    src, [dst], #1
        STRGEB  src, [dst], #1
        STRGTB  src, [dst], #1
        SUBS    n, n, tmp3              ; check number to go
        BLT     TrailingBytes           ; less than 4 bytes
        B       DstAligned

 ]

        END