; Copyright 1996 Acorn Computers Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;

        GBLS    Calling_Standard
Calling_Standard SETS "APCS_U"

        GET     s.h_Regs

        AREA    |C$$code|, CODE, READONLY

; Register roles for the whole routine.
src     RN      a2                      ; read pointer
dst     RN      a1                      ; write pointer (also the result)
n       RN      a3                      ; byte count, kept biased (see below)
wk1     RN      a4                      ; scratch
wk2     RN      ip                      ; scratch / alignment remainder

        EXPORT  memmove
        EXPORT  memcpy

;----------------------------------------------------------------------
; memmove / memcpy (one shared, overlap-safe implementation)
;
; Entry:  a1 = dst, a2 = src, a3 = n (byte count)
; Exit:   a1 = dst as passed in
; Uses:   a2, a3, a4 and ip are overwritten.  v1, v2 and lr are used
;         as extra temporaries but are stacked and restored first.
; Return: every exit uses MOVS pc, lr or LDM ...^, i.e. the 26-bit
;         style return that also reloads the caller's flags.
;
; Direction: when src is above dst the copy runs upwards from the
; start; when src is below dst it runs downwards from the end, so an
; overlapping move never overwrites source bytes it has yet to read.
; src == dst returns at once.
;
; Count bookkeeping: n is held as (bytes remaining - K), where K is
; the size of the unit about to be attempted (4, 12, 16 or 32).  A
; negative result of SUBS therefore means "not enough left for this
; unit".  The comment on each adjustment states the bias that results.
;
; NOTE(review): the comparisons on n are signed (BLT/BGE), so a count
; of 2^31 or more is not copied in full - confirm this is acceptable.
;
; The shift directions in the misaligned paths treat the lowest
; addressed byte as the least significant byte of a loaded word.
;----------------------------------------------------------------------

memmove
memcpy
        CMP     src, dst
        BLO     Descend                 ; src below dst: copy from the end
        MOVEQS  pc, lr                  ; same address: nothing to do

        ; ---- ascending copy -----------------------------------------
        STMFD   sp!, {dst, lr}          ; keep result value; frees lr
        SUBS    n, n, #4                ; n = remaining - 4
        BLT     Asc_LastBytes           ; fewer than 4 bytes in total

        ; Bring dst to a word boundary first; if it already is, look
        ; at src.  wk2 carries the low address bits to the handler.
        ANDS    wk2, dst, #3
        BNE     Asc_FixDst
        ANDS    wk2, src, #3
        BNE     Asc_Skewed              ; dst aligned, src is not

        ; Both pointers word aligned, n = remaining - 4 and n >= 0.
Asc_Aligned
        SUBS    n, n, #12-4             ; n = remaining - 12
        BLT     Asc_LastWords           ; at most two whole words

        ; wk1, wk2 and lr are the only free temporaries here.  v1 is
        ; stacked to make a fourth only if 32 bytes or more remain.
        SUBS    n, n, #32-12            ; n = remaining - 32
        BLT     Asc_Under32

        STMFD   sp!, {v1}

        ; Two four-register transfers per pass: 32 bytes.
Asc_Bulk32
        LDMIA   src!, {wk1, v1, wk2, lr}
        STMIA   dst!, {wk1, v1, wk2, lr}
        LDMIA   src!, {wk1, v1, wk2, lr}
        STMIA   dst!, {wk1, v1, wk2, lr}
        SUBS    n, n, #32
        BGE     Asc_Bulk32

        ; One more 16 byte transfer if that much is left.
        CMN     n, #16                  ; n + 16 >= 0 ?
        LDMGEIA src!, {wk1, v1, wk2, lr}
        STMGEIA dst!, {wk1, v1, wk2, lr}
        SUBGE   n, n, #16

        ; Fewer than 16 bytes remain on this route (n is still
        ; remaining - 32).  v1 is no longer needed.
        LDMFD   sp!, {v1}

        ; Also reached directly with 12..31 bytes remaining.
Asc_Under32
        ADDS    n, n, #32-12            ; n = remaining - 12

        ; Three words per pass while at least 12 bytes remain.
Asc_Triples
        LDMGEIA src!, {wk1, wk2, lr}
        STMGEIA dst!, {wk1, wk2, lr}
        SUBGES  n, n, #12
        BGE     Asc_Triples

        ; n = remaining - 12: zero, one or two whole words left.
Asc_LastWords
        ADDS    n, n, #12-4             ; n = remaining - 4
        BLT     Asc_LastBytes           ; no whole word left
        SUBS    n, n, #4                ; LT: one word, GE: two words
        LDRLT   wk1, [src], #4
        STRLT   wk1, [dst], #4
        LDMGEIA src!, {wk1, wk2}
        STMGEIA dst!, {wk1, wk2}
        SUBGE   n, n, #4                ; keep n = remaining - 4

        ; n = remaining - 4 with 0..3 bytes remaining.
Asc_LastBytes
        ADDS    n, n, #4                ; n = remaining
        LDMEQFD sp!, {a1, pc}^          ; none left: return saved dst
        CMP     n, #2                   ; LT = 1, EQ = 2, GT = 3 bytes
        LDRB    wk1, [src], #1
        STRB    wk1, [dst], #1
        LDRGEB  wk1, [src], #1
        STRGEB  wk1, [dst], #1
        LDRGTB  wk1, [src], #1
        STRGTB  wk1, [dst], #1
        LDMFD   sp!, {a1, pc}^          ; return saved dst

;----------------------------------------------------------------------
; Ascending: dst is not word aligned.  wk2 = dst AND 3 on entry.  At
; least four bytes remain, so the 1..3 byte copies below are in range.
;----------------------------------------------------------------------
Asc_FixDst
        RSB     wk2, wk2, #4            ; bytes needed to align dst
        CMP     wk2, #2                 ; LT = 1, EQ = 2, GT = 3 bytes
        LDRB    wk1, [src], #1
        STRB    wk1, [dst], #1
        LDRGEB  wk1, [src], #1
        STRGEB  wk1, [dst], #1
        LDRGTB  wk1, [src], #1
        STRGTB  wk1, [dst], #1
        SUBS    n, n, wk2               ; n = remaining - 4
        BLT     Asc_LastBytes
        ANDS    wk2, src, #3
        BEQ     Asc_Aligned             ; pointers are now co-aligned

        ; dst is word aligned, src is not.  wk2 = src AND 3.  Load the
        ; word containing src; lr then holds 4 - wk2 wanted bytes in
        ; its upper part and src addresses the following word.
Asc_Skewed
        BIC     src, src, #3
        LDR     lr, [src], #4
        CMP     wk2, #2
        BGT     Asc_Have1               ; src AND 3 = 3: one byte held
        BEQ     Asc_Have2               ; src AND 3 = 2: two bytes held

        ; Three wanted bytes are held in lr; each output word takes
        ; one more byte from the next source word.  n = remaining - 4
        ; and n >= 0.  Use the multi-register loop only for 16 bytes
        ; or more.
        ;
        ; The original listing carried cycle estimates for this path
        ; (alignment dependent): about 45-47 cycles per pass of the
        ; multi-register loop plus 27-28 of overhead, and 15n-3 cycles
        ; for 4n bytes in the single word loop.
        ; NOTE(review): those figures were quoted per 32 bytes, while
        ; the loop below moves 16 bytes per pass - treat as unverified.
Asc_Have3
        CMP     n, #16-4                ; at least 16 bytes remaining?
        BLT     Asc_Have3_Words
        SUB     n, n, #16-4             ; n = remaining - 16
        STMFD   sp!, {v1, v2}

        ; 16 bytes per pass.
Asc_Have3_Bulk
        MOV     wk1, lr, LSR #8         ; the three held bytes
        LDMIA   src!, {v1, v2, wk2, lr}
        ORR     wk1, wk1, v1, LSL #24   ; output word 1
        MOV     v1, v1, LSR #8
        ORR     v1, v1, v2, LSL #24     ; output word 2
        MOV     v2, v2, LSR #8
        ORR     v2, v2, wk2, LSL #24    ; output word 3
        MOV     wk2, wk2, LSR #8
        ORR     wk2, wk2, lr, LSL #24   ; output word 4
        STMIA   dst!, {wk1, v1, v2, wk2}
        SUBS    n, n, #16
        BGE     Asc_Have3_Bulk

        LDMFD   sp!, {v1, v2}
        ADDS    n, n, #16-4             ; n = remaining - 4
        BLT     Asc_Have3_Done          ; no whole word left

        ; One word per pass.
Asc_Have3_Words
        MOV     wk2, lr, LSR #8         ; the three held bytes
        LDR     lr, [src], #4           ; next source word
        ORR     wk2, wk2, lr, LSL #24
        STR     wk2, [dst], #4
        SUBS    n, n, #4
        BGE     Asc_Have3_Words         ; lr holds three bytes again

        ; Under four bytes left: step src back over the three bytes
        ; that were loaded but not yet stored.
Asc_Have3_Done
        SUB     src, src, #3
        B       Asc_LastBytes

        ; Two wanted bytes are held in lr; each output word takes two
        ; more from the next source word.  n = remaining - 4, n >= 0.
Asc_Have2
        CMP     n, #16-4                ; at least 16 bytes remaining?
        BLT     Asc_Have2_Words
        SUB     n, n, #16-4             ; n = remaining - 16
        STMFD   sp!, {v1, v2}

        ; 16 bytes per pass.
Asc_Have2_Bulk
        MOV     wk1, lr, LSR #16        ; the two held bytes
        LDMIA   src!, {v1, v2, wk2, lr}
        ORR     wk1, wk1, v1, LSL #16   ; output word 1
        MOV     v1, v1, LSR #16
        ORR     v1, v1, v2, LSL #16     ; output word 2
        MOV     v2, v2, LSR #16
        ORR     v2, v2, wk2, LSL #16    ; output word 3
        MOV     wk2, wk2, LSR #16
        ORR     wk2, wk2, lr, LSL #16   ; output word 4
        STMIA   dst!, {wk1, v1, v2, wk2}
        SUBS    n, n, #16
        BGE     Asc_Have2_Bulk

        LDMFD   sp!, {v1, v2}
        ADDS    n, n, #16-4             ; n = remaining - 4
        BLT     Asc_Have2_Done

        ; One word per pass.
Asc_Have2_Words
        MOV     wk2, lr, LSR #16        ; the two held bytes
        LDR     lr, [src], #4           ; next source word
        ORR     wk2, wk2, lr, LSL #16
        STR     wk2, [dst], #4
        SUBS    n, n, #4
        BGE     Asc_Have2_Words         ; lr holds two bytes again

        ; Under four bytes left: step src back over the two bytes
        ; that were loaded but not yet stored.
Asc_Have2_Done
        SUB     src, src, #2
        B       Asc_LastBytes

        ; One wanted byte is held in lr; each output word takes three
        ; more from the next source word.  n = remaining - 4, n >= 0.
Asc_Have1
        CMP     n, #16-4                ; at least 16 bytes remaining?
        BLT     Asc_Have1_Words
        SUB     n, n, #16-4             ; n = remaining - 16
        STMFD   sp!, {v1, v2}

        ; 16 bytes per pass.
Asc_Have1_Bulk
        MOV     wk1, lr, LSR #24        ; the one held byte
        LDMIA   src!, {v1, v2, wk2, lr}
        ORR     wk1, wk1, v1, LSL #8    ; output word 1
        MOV     v1, v1, LSR #24
        ORR     v1, v1, v2, LSL #8      ; output word 2
        MOV     v2, v2, LSR #24
        ORR     v2, v2, wk2, LSL #8     ; output word 3
        MOV     wk2, wk2, LSR #24
        ORR     wk2, wk2, lr, LSL #8    ; output word 4
        STMIA   dst!, {wk1, v1, v2, wk2}
        SUBS    n, n, #16
        BGE     Asc_Have1_Bulk

        LDMFD   sp!, {v1, v2}
        ADDS    n, n, #16-4             ; n = remaining - 4
        BLT     Asc_Have1_Done

        ; One word per pass.
Asc_Have1_Words
        MOV     wk2, lr, LSR #24        ; the one held byte
        LDR     lr, [src], #4           ; next source word
        ORR     wk2, wk2, lr, LSL #8
        STR     wk2, [dst], #4
        SUBS    n, n, #4
        BGE     Asc_Have1_Words         ; lr holds one byte again

        ; Under four bytes left: step src back over the one byte
        ; that was loaded but not yet stored.
Asc_Have1_Done
        SUB     src, src, #1
        B       Asc_LastBytes

;======================================================================
; Descending copy.  Mirrors the ascending code but works from the last
; byte towards the first.  lr still holds the return address here, so
; any stretch that borrows lr stacks it first.  Because dst is walked
; down by exactly the original count, it equals the caller's dst again
; at every exit and no saved copy is needed.
;======================================================================
Descend
        ADD     src, src, n             ; one past the last source byte
        ADD     dst, dst, n             ; one past the last dest byte
        SUBS    n, n, #4                ; n = remaining - 4
        BLT     Desc_LastBytes          ; fewer than 4 bytes in total

        ; Bring dst to a word boundary first; if it already is, look
        ; at src.  wk2 carries the low address bits to the handler.
        ANDS    wk2, dst, #3
        BNE     Desc_FixDst
        ANDS    wk2, src, #3
        BNE     Desc_Skewed             ; dst aligned, src is not

        ; Both pointers word aligned, n = remaining - 4 and n >= 0.
Desc_Aligned
        SUBS    n, n, #12-4             ; n = remaining - 12
        BLT     Desc_LastWords          ; at most two whole words

        ; From here to the matching LDMFD, v1 and lr are temporaries.
        STMFD   sp!, {v1, lr}
        SUBS    n, n, #32-12            ; n = remaining - 32
        BLT     Desc_Under32

        ; Two four-register transfers per pass: 32 bytes.
Desc_Bulk32
        LDMDB   src!, {wk1, v1, wk2, lr}
        STMDB   dst!, {wk1, v1, wk2, lr}
        LDMDB   src!, {wk1, v1, wk2, lr}
        STMDB   dst!, {wk1, v1, wk2, lr}
        SUBS    n, n, #32
        BGE     Desc_Bulk32

        ; Fewer than 32 bytes remain: one 16 byte transfer if it fits.
Desc_Under32
        CMN     n, #16                  ; n + 16 >= 0 ?
        LDMGEDB src!, {wk1, v1, wk2, lr}
        STMGEDB dst!, {wk1, v1, wk2, lr}
        SUBGE   n, n, #16

        ; Fewer than 16 bytes remain, so at most one 12 byte transfer.
        ADDS    n, n, #32-12            ; n = remaining - 12
        LDMGEDB src!, {wk1, wk2, lr}
        STMGEDB dst!, {wk1, wk2, lr}
        SUBGE   n, n, #12
        LDMFD   sp!, {v1, lr}

        ; n = remaining - 12: zero, one or two whole words left.
Desc_LastWords
        ADDS    n, n, #12-4             ; n = remaining - 4
        BLT     Desc_LastBytes          ; no whole word left
        SUBS    n, n, #4                ; LT: one word, GE: two words
        LDRLT   wk1, [src, #-4]!
        STRLT   wk1, [dst, #-4]!
        LDMGEDB src!, {wk1, wk2}
        STMGEDB dst!, {wk1, wk2}
        SUBGE   n, n, #4                ; keep n = remaining - 4

        ; n = remaining - 4 with 0..3 bytes remaining.
Desc_LastBytes
        ADDS    n, n, #4                ; n = remaining
        MOVEQS  pc, lr                  ; none left: a1 is original dst
        CMP     n, #2                   ; LT = 1, EQ = 2, GT = 3 bytes
        LDRB    wk1, [src, #-1]!
        STRB    wk1, [dst, #-1]!
        LDRGEB  wk1, [src, #-1]!
        STRGEB  wk1, [dst, #-1]!
        LDRGTB  wk1, [src, #-1]!
        STRGTB  wk1, [dst, #-1]!        ; a1 is original dst again
        MOVS    pc, lr

;----------------------------------------------------------------------
; Descending: dst is not word aligned.  wk2 = dst AND 3 on entry, and
; when moving downwards that is exactly the number of bytes to store
; to reach a word boundary.  At least four bytes remain.
;----------------------------------------------------------------------
Desc_FixDst
        CMP     wk2, #2                 ; LT = 1, EQ = 2, GT = 3 bytes
        LDRB    wk1, [src, #-1]!
        STRB    wk1, [dst, #-1]!
        LDRGEB  wk1, [src, #-1]!
        STRGEB  wk1, [dst, #-1]!
        LDRGTB  wk1, [src, #-1]!
        STRGTB  wk1, [dst, #-1]!
        SUBS    n, n, wk2               ; n = remaining - 4
        BLT     Desc_LastBytes
        ANDS    wk2, src, #3
        BEQ     Desc_Aligned            ; pointers are now co-aligned

        ; dst is word aligned, src is not.  wk2 = src AND 3.  Load the
        ; word containing the bytes just below src; wk1 then holds wk2
        ; wanted bytes in its lower part and src addresses that word.
Desc_Skewed
        BIC     src, src, #3
        LDR     wk1, [src]
        CMP     wk2, #2
        BLT     Desc_Have1              ; src AND 3 = 1: one byte held
        BEQ     Desc_Have2              ; src AND 3 = 2: two bytes held

        ; Three wanted bytes are held in wk1; each output word takes
        ; one more byte from the preceding source word.  n = remaining
        ; - 4 and n >= 0.  Use the multi-register loop only for 16
        ; bytes or more.
Desc_Have3
        CMP     n, #16-4                ; at least 16 bytes remaining?
        BLT     Desc_Have3_Words
        SUB     n, n, #16-4             ; n = remaining - 16
        STMFD   sp!, {v1, v2, lr}

        ; 16 bytes per pass.
Desc_Have3_Bulk
        MOV     lr, wk1, LSL #8         ; the three held bytes
        LDMDB   src!, {wk1, v1, v2, wk2}
        ORR     lr, lr, wk2, LSR #24    ; output word 4 (highest)
        MOV     wk2, wk2, LSL #8
        ORR     wk2, wk2, v2, LSR #24   ; output word 3
        MOV     v2, v2, LSL #8
        ORR     v2, v2, v1, LSR #24     ; output word 2
        MOV     v1, v1, LSL #8
        ORR     v1, v1, wk1, LSR #24    ; output word 1 (lowest)
        STMDB   dst!, {v1, v2, wk2, lr}
        SUBS    n, n, #16
        BGE     Desc_Have3_Bulk

        LDMFD   sp!, {v1, v2, lr}
        ADDS    n, n, #16-4             ; n = remaining - 4
        BLT     Desc_Have3_Done         ; no whole word left

        ; One word per pass.
Desc_Have3_Words
        MOV     wk2, wk1, LSL #8        ; the three held bytes
        LDR     wk1, [src, #-4]!        ; preceding source word
        ORR     wk2, wk2, wk1, LSR #24
        STR     wk2, [dst, #-4]!
        SUBS    n, n, #4
        BGE     Desc_Have3_Words        ; wk1 holds three bytes again

        ; Under four bytes left: step src forward over the three
        ; bytes that were loaded but not yet stored.
Desc_Have3_Done
        ADD     src, src, #3
        B       Desc_LastBytes

        ; Two wanted bytes are held in wk1; each output word takes two
        ; more from the preceding source word.  n = remaining - 4,
        ; n >= 0.
Desc_Have2
        CMP     n, #16-4                ; at least 16 bytes remaining?
        BLT     Desc_Have2_Words
        SUB     n, n, #16-4             ; n = remaining - 16
        STMFD   sp!, {v1, v2, lr}

        ; 16 bytes per pass.
Desc_Have2_Bulk
        MOV     lr, wk1, LSL #16        ; the two held bytes
        LDMDB   src!, {wk1, v1, v2, wk2}
        ORR     lr, lr, wk2, LSR #16    ; output word 4 (highest)
        MOV     wk2, wk2, LSL #16
        ORR     wk2, wk2, v2, LSR #16   ; output word 3
        MOV     v2, v2, LSL #16
        ORR     v2, v2, v1, LSR #16     ; output word 2
        MOV     v1, v1, LSL #16
        ORR     v1, v1, wk1, LSR #16    ; output word 1 (lowest)
        STMDB   dst!, {v1, v2, wk2, lr}
        SUBS    n, n, #16
        BGE     Desc_Have2_Bulk

        LDMFD   sp!, {v1, v2, lr}
        ADDS    n, n, #16-4             ; n = remaining - 4
        BLT     Desc_Have2_Done

        ; One word per pass.
Desc_Have2_Words
        MOV     wk2, wk1, LSL #16       ; the two held bytes
        LDR     wk1, [src, #-4]!        ; preceding source word
        ORR     wk2, wk2, wk1, LSR #16
        STR     wk2, [dst, #-4]!
        SUBS    n, n, #4
        BGE     Desc_Have2_Words        ; wk1 holds two bytes again

        ; Under four bytes left: step src forward over the two bytes
        ; that were loaded but not yet stored.
Desc_Have2_Done
        ADD     src, src, #2
        B       Desc_LastBytes

        ; One wanted byte is held in wk1; each output word takes three
        ; more from the preceding source word.  n = remaining - 4,
        ; n >= 0.
Desc_Have1
        CMP     n, #16-4                ; at least 16 bytes remaining?
        BLT     Desc_Have1_Words
        SUB     n, n, #16-4             ; n = remaining - 16
        STMFD   sp!, {v1, v2, lr}

        ; 16 bytes per pass.
Desc_Have1_Bulk
        MOV     lr, wk1, LSL #24        ; the one held byte
        LDMDB   src!, {wk1, v1, v2, wk2}
        ORR     lr, lr, wk2, LSR #8     ; output word 4 (highest)
        MOV     wk2, wk2, LSL #24
        ORR     wk2, wk2, v2, LSR #8    ; output word 3
        MOV     v2, v2, LSL #24
        ORR     v2, v2, v1, LSR #8      ; output word 2
        MOV     v1, v1, LSL #24
        ORR     v1, v1, wk1, LSR #8     ; output word 1 (lowest)
        STMDB   dst!, {v1, v2, wk2, lr}
        SUBS    n, n, #16
        BGE     Desc_Have1_Bulk

        LDMFD   sp!, {v1, v2, lr}
        ADDS    n, n, #16-4             ; n = remaining - 4
        BLT     Desc_Have1_Done

        ; One word per pass.
Desc_Have1_Words
        MOV     wk2, wk1, LSL #24       ; the one held byte
        LDR     wk1, [src, #-4]!        ; preceding source word
        ORR     wk2, wk2, wk1, LSR #8
        STR     wk2, [dst, #-4]!
        SUBS    n, n, #4
        BGE     Desc_Have1_Words        ; wk1 holds one byte again

        ; Under four bytes left: step src forward over the one byte
        ; that was loaded but not yet stored.
Desc_Have1_Done
        ADD     src, src, #1
        B       Desc_LastBytes

;----------------------------------------------------------------------

        END