;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Microsoft Research Singularity
;;;
;;; Copyright (c) Microsoft Corporation. All rights reserved.
;;;
;;; This file contains ARM-specific assembly code.
;;;

;**********************************************************************
; void *
; memcpy( void *dest, const void *src, size_t count );
;       The memcpy function copies count bytes of src to dest.
;       If the source and destination overlap, this function does
;       not ensure that the original source bytes in the overlapping
;       region are copied before being overwritten. Use memmove to
;       handle overlapping regions.
;
; Register usage in this routine:
;       R0  (dest)   in: destination pointer; restored from the stack on
;                    exit, so the original dest value is what is returned
;       R1  (source) in: source pointer, advanced as bytes are read
;       R2  (count)  in: byte count, consumed as the copy proceeds
;       R3, R12      scratch (temp1, temp4), not saved
;       R4, R5, lr   scratch (temp2, temp3, lr), saved in the prologue
;
; Copies whose count exceeds the threshold computed below are handed to
; memcpybigblk (which follows this routine) by falling through
; UNDO_PROLOG.
;
; NOTE(review): the shift/ORR merging in the misaligned paths places
; later source bytes in higher bit positions, which looks like it
; assumes little-endian byte order - confirm for the target.
;
;**********************************************************************

        OPT     2       ; disable listing
        INCLUDE kxarm.inc
        OPT     1       ; reenable listing

; Symbolic names for the registers used below.
dest    RN      R0
source  RN      R1
count   RN      R2
temp1   RN      R3
temp2   RN      R4
temp3   RN      R5
temp4   RN      R12

        IF Thumbing
        THUMBAREA
        ENDIF

        NESTED_ENTRY memcpy

        ROUT

        IF Thumbing
        ; Switch from Thumb mode to ARM mode
        DCW     0x4778          ; bx pc
        DCW     0x46C0          ; nop
        ENDIF

        ;//Save registers onto the stack
        ; Four words are pushed (16 bytes); UNDO_PROLOG relies on that size.
        STMDB   sp!, {dest,temp2,temp3,lr}      ; save registers

        PROLOG_END

        ; Use a threshold to determine which code to use:
        ;
        ; if destination & source are naturally aligned, then
        ;       threshold = 512
        ; else
        ;       threshold = 128
        ;
        ; if copy size > threshold, then
        ;       use memcpybigblk
        ; else
        ;       use .NET code
        ;
        ; "Naturally aligned" here means both pointers have their low two
        ; bits clear: the test is (dest | source) & 3. The comparison
        ; against the threshold is unsigned (BHI).

        ORR     temp1, dest, source
        TST     temp1, #3
        MOVEQ   temp1, #512
        MOVNE   temp1, #128
        CMP     count, temp1
        BHI     UNDO_PROLOG     ; revert and continue to memcpybigblk

        ; NOTE: UNDO_PROLOG just restores SP, so do NOT modify anything other
        ; than r3 (temp1) and r12 (temp4) before this point

;**********************************************************************
; Copy from head to tail, i.e. in ascending address order (every load
; and store below post-increments its pointer).
; NOTE(review): the original banner here was garbled ("to avoid source
; overwrite because the source / destination the source"); its
; intended wording could not be recovered from this file.
;**********************************************************************

HEAD_TO_TAIL
        ;if LT 8 bytes store them and exit
        CMP     count, #8                       ; 2-3 cycles
        BLT     BYTEMOVE4

        ;Check alignment of parameters
        ANDS    temp1, dest, #3                 ; 2-3 cycles
        BEQ     SRCALIGN

        ; destination is at least 1 byte misaligned
        ; Read and write (4 - alignment) bytes to align destination.
        ; temp1 becomes the number of bytes (1..3) needed; the CMP against 2
        ; drives the conditional second (GE) and third (GT) byte copies.
        RSB     temp1, temp1, #4                ; 9 cycles
        LDRB    temp2, [source], #1
        CMP     temp1, #2
        STRB    temp2, [dest], #1
        LDRGEB  temp3, [source], #1             ; >= 2 == at least 2 bytes
        LDRGTB  temp2, [source], #1             ; > 2 == 3 bytes unaligned
        SUB     count, count, temp1
        STRGEB  temp3, [dest], #1
        STRGTB  temp2, [dest], #1

        ; dest is word aligned from here on; dispatch on the low two bits
        ; of source.
SRCALIGN                                        ; 3 - 7 cycles
        TST     source, #1                      ; test bit 0 of src
        BNE     UNALIGNED                       ; src address is odd (offset 1 or 3)
        TST     source, #2
        BNE     HWORDMOVE                       ; src is halfword (not word) aligned

;
;word aligned source and destination, move blocks of 32 bytes
;until we have less than 32 bytes left, then divide moves in
;half down to less than 4, where we will move the last 3 or less
;bytes
;
; Throughout the block stages count is kept biased below the true
; number of remaining bytes so that the flags from the SUBS/ADDS tell
; whether another block of that size fits.
;
WORDMOVE
        SUBS    count, count, #32               ; 2-3 cycles
        BLT     BLK16

        ; lr is used as a fourth data register; it was saved in the prologue.
BLK32                                           ; 20 cycles/32 bytes
        LDMIA   source!, {temp1,temp2,temp3,lr}
        STMIA   dest!, {temp1,temp2,temp3,lr}
        LDMIA   source!, {temp1,temp2,temp3,lr}
        SUBS    count, count, #32
        STMIA   dest!, {temp1,temp2,temp3,lr}
        BGE     BLK32

        ; count is (remaining - 32) on entry; after the ADDS it is
        ; (remaining - 16). GE: copy 16 bytes. EQ: nothing left, exit.
        ; GT: subtract 16 again so count stays biased by -16.
BLK16                                           ; 11-4 cycles/16 bytes
        ADDS    count, count, #16
        LDMGEIA source!, {temp1, temp2, temp3, lr}
        STMGEIA dest!, {temp1, temp2, temp3, lr}
        BEQ     WORD_BYTES_EXIT
        SUBGTS  count, count, #16

        ; After the ADDS count is (remaining - 8). SUBGE has no S suffix, so
        ; the flags from the ADDS still control the STMGEIA.
BLK8                                            ; 6 cycles/8 bytes
        ADDS    count, count, #8
        LDMGEIA source!, {temp1, temp2}
        SUBGE   count, count, #8
        STMGEIA dest!, {temp1, temp2}

        ; After the ADDS count is (remaining - 4). If a word is copied (GE)
        ; count is then the true remaining byte count (0..3).
BLK4
        ADDS    count, count, #4                ; 6-9 cycles/4 bytes
        LDRGE   temp1, [source], #4
        STRGE   temp1, [dest], #4

        ; The flags are still those of the ADDS in BLK4. If no word was
        ; copied (LT) the bias is removed here; either way EQ means zero
        ; bytes remain.
WORD_BYTES
        ADDLTS  count, count, #4
        BEQ     WORD_BYTES_EXIT                 ; On zero, Return to caller
        ; A whole word is loaded even though only 1-3 bytes of it are stored.
        LDR     temp1, [source], #4             ; 10 cycles/1-3 bytes
        CMP     count, #2
        STRGEH  temp1, [dest], #2               ; 2 or 3 left: store two bytes
        STRLTB  temp1, [dest], #1               ; 1 left: store one byte
        MOVGT   temp1, temp1, LSR #16           ; 3 left: bring down the third byte
        STRGTB  temp1, [dest], #1

WORD_BYTES_EXIT
        ; Restore saved registers (including the original dest in R0) and
        ; return.
        IF Interworking :LOR: Thumbing
        LDMIA   sp!, {dest, temp2, temp3, lr}
        BX      lr
        ELSE
        LDMIA   sp!, {dest, temp2, temp3, pc}
        ENDIF

;
; half word align source and destination
;
; Reached when dest is word aligned and source has bit 1 set and bit 0
; clear. A halfword is read first, which leaves source word aligned;
; temp1 then carries 2 bytes that are merged (LSL/LSR #16) with each
; following word to build the words that are stored.
;
HWORDMOVE                                       ; 2-3 cycles
        LDRH    temp1, [source], #2
        SUBS    count, count, #32
        BLT     HWORD8_TST

HWORD32                                         ; 35 cycles/32 bytes
        LDMIA   source!, {temp2,temp3,temp4,lr}
        ORR     temp1, temp1, temp2, LSL #16
        MOV     temp2, temp2, LSR #16
        ORR     temp2, temp2, temp3, LSL #16
        MOV     temp3, temp3, LSR #16
        ORR     temp3, temp3, temp4, LSL #16
        MOV     temp4, temp4, LSR #16
        ORR     temp4, temp4, lr, LSL #16
        STMIA   dest!, {temp1,temp2,temp3,temp4}        ; Store bytes 1-16
        MOV     temp1, lr, LSR #16
        LDMIA   source!, {temp2,temp3,temp4,lr}
        ORR     temp1, temp1, temp2, LSL #16
        MOV     temp2, temp2, LSR #16
        ORR     temp2, temp2, temp3, LSL #16
        MOV     temp3, temp3, LSR #16
        ORR     temp3, temp3, temp4, LSL #16
        MOV     temp4, temp4, LSR #16
        ORR     temp4, temp4, lr, LSL #16
        STMIA   dest!, {temp1,temp2,temp3,temp4}        ; Store bytes 17-32
        SUBS    count, count, #32
        MOV     temp1, lr, LSR #16                      ; new carry; flags untouched
        BGE     HWORD32

        ; Change the bias from -32 to -8.
HWORD8_TST
        ADDS    count, count, #24
        BLT     HWORD4

HWORD8                                          ; 11 cycles/8 bytes
        LDMIA   source!, {temp2,temp3}
        ORR     temp1, temp1, temp2, LSL #16
        MOV     temp2, temp2, LSR #16
        ORR     temp2, temp2, temp3, LSL #16
        STMIA   dest!, {temp1, temp2}
        SUBS    count, count, #8
        MOV     temp1, temp3, LSR #16
        BGE     HWORD8

HWORD4                                          ; 3-7 cycles/4 bytes
        ADDS    count, count, #4
        BLT     HWORD_BYTES
        LDR     temp2, [source], #4
        ORR     temp1, temp1, temp2, LSL #16
        STR     temp1, [dest], #4
        MOV     temp1, temp2, LSR #16

        ; temp1 holds the 2 carried bytes. 1 left: store its low byte.
        ; 2 or 3 left: store it as a halfword. 3 left: also read and store
        ; one more byte.
HWORD_BYTES                                     ; 5-11 cycles/1-3 bytes
        ADDLTS  count, count, #4
        BEQ     HWORD_BYTES_EXIT                ; On zero, Return to caller
        CMP     count, #2
        STRLTB  temp1, [dest], #1
        LDRGTB  temp2, [source], #1
        STRGEH  temp1, [dest], #2
        STRGTB  temp2, [dest], #1

HWORD_BYTES_EXIT
        IF Interworking :LOR: Thumbing
        LDMIA   sp!, {dest, temp2, temp3, lr}
        BX      lr
        ELSE
        LDMIA   sp!, {dest, temp2, temp3, pc}
        ENDIF

;
; Unaligned Moves
;
; Reached when source bit 0 is set. Bit 1 selects between offset 3
; (UNALIGNED3) and offset 1 (UNALIGNED1).
;
UNALIGNED
        TST     source, #2
        BEQ     UNALIGNED1

        ; Source offset 3: one byte is read, which leaves source word
        ; aligned; temp1 carries 1 byte that is merged (LSL #8 / LSR #24)
        ; with each following word.
UNALIGNED3                                      ; 3-4 cycles
        LDRB    temp1, [source], #1
        SUBS    count, count, #32
        BLT     OFFTHREE8_TST

OFFTHREE32                                      ; 35 cycles/32 bytes
        LDMIA   source!, {temp2,temp3,temp4,lr}
        ORR     temp1, temp1, temp2, LSL #8
        MOV     temp2, temp2, LSR #24
        ORR     temp2, temp2, temp3, LSL #8
        MOV     temp3, temp3, LSR #24
        ORR     temp3, temp3, temp4, LSL #8
        MOV     temp4, temp4, LSR #24
        ORR     temp4, temp4, lr, LSL #8
        STMIA   dest!, {temp1,temp2,temp3,temp4}        ; Store bytes 1-16
        MOV     temp1, lr, LSR #24
        LDMIA   source!, {temp2,temp3,temp4,lr}
        ORR     temp1, temp1, temp2, LSL #8
        MOV     temp2, temp2, LSR #24
        ORR     temp2, temp2, temp3, LSL #8
        MOV     temp3, temp3, LSR #24
        ORR     temp3, temp3, temp4, LSL #8
        MOV     temp4, temp4, LSR #24
        ORR     temp4, temp4, lr, LSL #8
        STMIA   dest!, {temp1,temp2,temp3,temp4}        ; Store bytes 17-32
        SUBS    count, count, #32
        MOV     temp1, lr, LSR #24
        BGE     OFFTHREE32

        ; Change the bias from -32 to -8.
OFFTHREE8_TST
        ADDS    count, count, #24
        BLT     OFFTHREE4

OFFTHREE8                                       ; 11 cycles/8 bytes
        LDMIA   source!, {temp2,temp3}
        ORR     temp1, temp1, temp2, LSL #8
        MOV     temp2, temp2, LSR #24
        ORR     temp2, temp2, temp3, LSL #8
        STMIA   dest!, {temp1, temp2}
        SUBS    count, count, #8
        MOV     temp1, temp3, LSR #24
        BGE     OFFTHREE8

OFFTHREE4                                       ; 3-7 cycles/4 bytes
        ADDS    count, count, #4
        BLT     OFFTHREE_BYTES
        LDR     temp2, [source], #4
        ORR     temp1, temp1, temp2, LSL #8
        STR     temp1, [dest], #4
        MOV     temp1, temp2, LSR #24

        ; temp1 holds the 1 carried byte, which is always stored. 2 or 3
        ; left: a halfword is read and its low byte stored. 3 left: its
        ; high byte is stored as well.
OFFTHREE_BYTES                                  ; 5-12 cycles/ 1-3 bytes
        ADDLTS  count, count, #4
        BEQ     OFFTHREE_EXIT                   ; On zero, Return to caller
        CMP     count, #2
        LDRGEH  temp2, [source], #2
        STRB    temp1, [dest], #1
        STRGEB  temp2, [dest], #1
        MOVGT   temp2, temp2, LSR #8
        STRGTB  temp2, [dest], #1

OFFTHREE_EXIT
        IF Interworking :LOR: Thumbing
        LDMIA   sp!, {dest, temp2, temp3, lr}
        BX      lr
        ELSE
        LDMIA   sp!, {dest, temp2, temp3, pc}   ; Return to caller
        ENDIF

;
; Source is one byte from word alignment.
; Read a byte & half word then multiple words and a byte. Then
; shift and ORR them into consecutive words for STM writes
;
; After the byte and halfword reads source is word aligned and temp1
; carries 3 bytes, merged (LSL #24 / LSR #8) with each following word.
; The ORR that builds the initial carry has no S suffix, so the BLT
; still sees the flags from the SUBS.
;
UNALIGNED1                                      ; 5-6 cycles
        LDRB    temp1, [source], #1
        LDRH    temp2, [source], #2
        SUBS    count, count, #32
        ORR     temp1, temp1, temp2, LSL #8
        BLT     OFFONE8_TST

OFFONE32                                        ; 35 cycles/32 bytes
        LDMIA   source!, {temp2, temp3, temp4, lr}
        ORR     temp1, temp1, temp2, LSL #24
        MOV     temp2, temp2, LSR #8
        ORR     temp2, temp2, temp3, LSL #24
        MOV     temp3, temp3, LSR #8
        ORR     temp3, temp3, temp4, LSL #24
        MOV     temp4, temp4, LSR #8
        ORR     temp4, temp4, lr, LSL #24
        STMIA   dest!, {temp1,temp2,temp3,temp4}        ; Store bytes 1-16
        MOV     temp1, lr, LSR #8
        LDMIA   source!, {temp2,temp3,temp4,lr}
        ORR     temp1, temp1, temp2, LSL #24
        MOV     temp2, temp2, LSR #8
        ORR     temp2, temp2, temp3, LSL #24
        MOV     temp3, temp3, LSR #8
        ORR     temp3, temp3, temp4, LSL #24
        MOV     temp4, temp4, LSR #8
        ORR     temp4, temp4, lr, LSL #24
        STMIA   dest!, {temp1,temp2,temp3,temp4}        ; Store bytes 17-32
        SUBS    count, count, #32
        MOV     temp1, lr, LSR #8
        BGE     OFFONE32

        ; Change the bias from -32 to -8.
OFFONE8_TST
        ADDS    count, count, #24
        BLT     OFFONE4

OFFONE8                                         ; 11 cycles/8 bytes
        LDMIA   source!, {temp2,temp3}
        ORR     temp1, temp1, temp2, LSL #24
        MOV     temp2, temp2, LSR #8
        ORR     temp2, temp2, temp3, LSL #24
        STMIA   dest!, {temp1,temp2}
        SUBS    count, count, #8
        MOV     temp1, temp3, LSR #8
        BGE     OFFONE8

        ; The BEQ after the STR tests the flags from the ADDS: exactly 4
        ; bytes were left, so the copy is complete.
OFFONE4                                         ; 3-9 cycles/4 bytes
        ADDS    count, count, #4
        BLT     OFFONE_BYTES
        LDR     temp2, [source], #4
        ORR     temp1, temp1, temp2, LSL #24
        STR     temp1, [dest], #4
        BEQ     OFFONE_EXIT
        MOV     temp1, temp2, LSR #8

        ; temp1 holds the 3 carried bytes, so no further loads are needed.
        ; 1 left: store the low byte. 2 or 3 left: store a halfword.
        ; 3 left: shift down and store the third byte.
OFFONE_BYTES                                    ; 11 cycles/1-3 bytes
        ADDLTS  count, count, #4
        BEQ     OFFONE_EXIT
        CMP     count, #2
        STRLTB  temp1, [dest], #1
        STRGEH  temp1, [dest], #2
        MOVGT   temp1, temp1, LSR #16
        STRGTB  temp1, [dest], #1

OFFONE_EXIT
        IF Interworking :LOR: Thumbing
        LDMIA   sp!, {dest, temp2, temp3, lr}
        BX      lr
        ELSE
        LDMIA   sp!, {dest, temp2, temp3, pc}   ; Return to caller
        ENDIF

        ; Reached from HEAD_TO_TAIL when count is less than 8. Copies four
        ; bytes one at a time if at least four remain; no alignment is
        ; assumed.
BYTEMOVE4                                       ; 12 cycles/4 bytes
        CMP     count, #4
        BLT     MMOVEXIT
        LDRB    temp1, [source], #1
        SUB     count, count, #4
        LDRB    temp2, [source], #1
        LDRB    temp3, [source], #1
        LDRB    lr, [source], #1
        STRB    temp1, [dest], #1
        STRB    temp2, [dest], #1
        STRB    temp3, [dest], #1
        STRB    lr, [dest], #1

        ; Return immediately if nothing is left; otherwise fall through to
        ; BYTEMOVE.
MMOVEXIT                                        ; 2-5 cycles
        CMP     count, #0
        IF Interworking :LOR: Thumbing
        LDMEQIA sp!, {dest, temp2, temp3, lr}
        BXEQ    lr
        ELSE
        LDMEQIA sp!, {dest, temp2, temp3, pc}   ; On zero, Return to caller
        ENDIF

;
; Store last 3 or so bytes and exit
;
; The first byte is copied unconditionally; the CMP against 2 then
; selects the second (GE) and third (GT) byte copies.
;
BYTEMOVE                                        ; 4-7 cycles/1 byte
        LDRB    temp1, [source], #1
        CMP     count, #2
        STRB    temp1, [dest], #1
        BLT     BYTEMOVE_EXIT
        LDRGEB  temp2, [source], #1             ; 8 cycles/1-2 bytes
        LDRGTB  temp3, [source], #1
        STRGEB  temp2, [dest], #1
        STRGTB  temp3, [dest], #1

BYTEMOVE_EXIT
        IF Interworking :LOR: Thumbing
        LDMIA   sp!, {dest, temp2, temp3, lr}
        BX      lr
        ELSE
        LDMIA   sp!, {dest, temp2, temp3, pc}   ; Return to caller
        ENDIF

        ; THIS IS NOT A RETURN
        ; The following reverts the stack to its state at the point of entry
        ; of memcpy. It then falls through to memcpybigblk to perform the
        ; large copy
        ; The 0x10 matches the four words pushed by the prologue; nothing is
        ; reloaded because only temp1 has been written before the branch
        ; that leads here.
UNDO_PROLOG
        ADD     sp, sp, #0x10
        ;
        ; FALLTHRU
        ;

        ENTRY_END memcpy

        NESTED_ENTRY memcpybigblk

        ROUT

        ;//Save registers onto the stack
        ;//R3 should be OK to destroy. If not, we stack it off too.
        stmfd   sp!, {r0,r4-r11, lr}

        PROLOG_END

prefetch_setup
        ;//Prefetch the source.
        ;//Have to align source register with word boundary first
        mov     r5, r1
        and     r5, r5, #~0x3

        ;//The PLD instruction just happens to be a Never Execute on ARM V4,
        ;//so we can in-line the PLD instruction and still maintain V4 compatibility
        ;// 0x0000000c: f5d5f000 .... PLD [r5,#0]
        ;// 0x00000010: f5d5f020 ... PLD [r5,#0x20]
        ;// 0x00000014: f5d5f040 @... PLD [r5,#0x40]
        DCD     0xf5d5f000
        DCD     0xf5d5f020
        DCD     0xf5d5f040

        ;//If there are 4 or less bytes to copy, we just jump to the end
        ;//and do a straight byte copy.
        cmp     r2, #4
        bls     finish

        ;//Align the destination to a word boundary.
rsb r4, r0, #0 ;//Figure out how many bytes ands r4, r4, #0x2 ;//See if we need to do 2 copies ldrneb r5, [r1], #1 ;//Read the two bytes ldrneb r6, [r1], #1 subne r2, r2, #2 ;//Decrement count by 2 strneb r5, [r0], #1 ;//Now store the two bytes strneb r6, [r0], #1 ;//Have to do two seperate byte stores ;//because of possible address misalignment ands r4, r0, #0x1 ;//See if we need to do 1 copy ldrneb r5, [r1], #1 ;//Load the single byte subne r2, r2, #1 ;//Decrement count by 1 strneb r5, [r0], #1 ;//Store the single byte ;//We need to choose which memcpy we use based ;//on how the source is now aligned. If the destination and source ;//are both aligned, then we fall through to the aligned copy ;//Check the byte alignment of the source ;//We do it in reverse order just because. If most memcopies are ;//expected to be off by a certain #, that should be placed first. and r3, r1, #3 cmp r3, #3 ;//If both bits are set, go do case 3, off by 3 bytes beq memcpyoffby3 ;//Goto case 3 cmp r3, #2 ;//Check for case 2, off by 2 bytes beq memcpyoffby2 ;//Goto case 2 cmp r3, #1 ;//Check for case 1, off by 1 byte beq memcpyoffby1 ;//Goto case 1 ;//The source and destination are word aligned. We get an easy job. memcpyoffby0 ;//Now we need to align the destination to a cache line boundary ;//We need to figure out how many words are needed to align it. ;//If the number of words to align it are less than the number of words ;//we're asked to copy, just copy the required number of words. 
and r4, r0, #0x1C ;//Grab the low bits of the destination rsb r4, r4, #32 ;//Negate them and ;//add 32 to the low bits(this is ;//how many we need to move to get aligned) and r5, r2, #0x1C ;//Check only the number of words from count cmp r4, r2 ;//Compare low bits to align against the words from count movhi r4, r5 ;//If words to align is greater than the count, then ;//use the words from count instead cmp r4, #0 beq offby0mainloop ;//r4 now contains the number of times we need to do a word load/store ;//So we need to sortof back-calculate how many of the word load/stores to ;//skip in memcpyoffby0cachelinealignload/store rsb r3, r4, #32 and r3, r3, #0x1C ;//r3 now contains the number of *instructions* to skip over. ;//Deduct words from size sub r2, r2, r4 ;//Because the & 0x1C corresponds to words, we don't have to shift anything ;//when we jump into load table ;//Using two jump tables is faster because it gives the processor a chance to load ;//data before we try to store it out. adr r12, offby0cachelinealignload add pc, r12, r3 offby0cachelinealignload ;//Need to have up to 8 words (1 cache line) ldr r4, [r1], #4 ;//Could also do load/store pairs, and shift ldr r5, [r1], #4 ;//r3 left 1 bit to calculate jump address ldr r6, [r1], #4 ldr r7, [r1], #4 ldr r8, [r1], #4 ldr r9, [r1], #4 ldr r10,[r1], #4 ldr r11,[r1], #4 ;//Now jump into the store table adr r12, offby0cachelinealignstore add pc, r12, r3 offby0cachelinealignstore str r4, [r0], #4 str r5, [r0], #4 str r6, [r0], #4 str r7, [r0], #4 str r8, [r0], #4 str r9, [r0], #4 str r10,[r0], #4 str r11,[r0], #4 ;//We are now cache line aligned. ;//We loop around doing prefetches and copies based on how far ahead we want to look offby0mainloop cmp r2, #(32*3 + 32) ;//Only keep looking ahead by 4 cache lines bmi offby0endofmainloop ;//Preload the data ;// 0x000000f4: f5d1f060 `... PLD [r1,#0x60] ;// 0x000000f8: f5d1f080 .... 
PLD [r1,#0x80] DCD 0xf5d1f060 DCD 0xf5d1f080 ;//Here is the main loop that handles pipelining the loads ldmia r1!, {r4-r11} stmia r0!, {r4-r11} ldmia r1!, {r4-r11} stmia r0!, {r4-r11} sub r2, r2, #64 ;//Take 64 bytes off of count b offby0mainloop offby0endofmainloop ;//If we still have more than 32*4 words to move, do one more preload cmp r2, #32*4 bls offby0nopreload ;// 0x0000011c: f5d1f080 .... PLD [r1,#0x80] DCD 0xf5d1f080 offby0nopreload ;//Now we finish up the copy without any preloads. The data should have already ;//been loaded into the caches ;//Copy 32 bytes at a time offby0finishcachelines cmp r2, #32 bmi offby0endoffinishcachelines ldmia r1!, {r4-r11} stmia r0!, {r4-r11} sub r2, r2, #32 ;//Take 32 bytes off of count b offby0finishcachelines offby0endoffinishcachelines ;//Now we need to finish off any partial cache lines that may be left. We do a similar ;//algorithm to the cachelinealign loop above. ands r3, r2, #0x1C ;//Get number of words left beq finish ;//If words left==0, then branch to finish sub r2, r2, r3 ;//Subtract words left from count rsb r3, r3, #32 ;//Get 32-number of words left adr r12, offby0finishload ;//That's the instructions to skip add pc, r12, r3 offby0finishload ;//Need to have up to 8 words (1 cache line) ldr r4, [r1], #4 ;//Could also do load/store pairs, and shift ldr r5, [r1], #4 ;//r3 left 1 bit to calculate jump address ldr r6, [r1], #4 ldr r7, [r1], #4 ldr r8, [r1], #4 ldr r9, [r1], #4 ldr r10,[r1], #4 ldr r11,[r1], #4 ;//Now jump into the store table adr r12, offby0finishstore add pc, r12, r3 offby0finishstore str r4, [r0], #4 str r5, [r0], #4 str r6, [r0], #4 str r7, [r0], #4 str r8, [r0], #4 str r9, [r0], #4 str r10,[r0], #4 str r11,[r0], #4 ;//Copy the last 4 bytes, if necessary rsb r2, r2, #4 ;//Find how many bytes to copy (0, 1,2,3, or 4) adr r12, finishloadby0 add pc, r12, r2, LSL #2 ;//Need to shift r2 left by 2 bits to jump instructions finishloadby0 ldrb r3, [r1], #1 ldrb r4, [r1], #1 ldrb r5, [r1], #1 ldrb r6, 
[r1], #1 adr r12, finishstoreby0 add pc, r12, r2, LSL #2 finishstoreby0 strb r3, [r0], #1 strb r4, [r0], #1 strb r5, [r0], #1 strb r6, [r0], #1 ;//Return to calling function IF Interworking :LOR: Thumbing ldmfd sp!, {r0,r4-r11, lr} bx lr ELSE ldmfd sp!, {r0,r4-r11, pc} ENDIF ;//The source and destination are not aligned. We're going to have ;//to load and shift data from a temporary buffer. Stuff needs to be ;//shifted to the right by 8 bits to align properly memcpyoffby1 ;//First we need to word align the source and r3, r1, #~0x3 ;//Load the first value into the holding buffer (lr) ldr lr, [r3], #4 mov lr, lr, LSR #8 ;//Now we need to align the destination to a cache line boundary ;//We need to figure out how many words are needed to align it. ;//If the number of words to align it are less than the number of words ;//we're asked to copy, just copy the required number of words. and r4, r0, #0x1C ;//Grab the low bits of the destination rsb r4, r4, #32 ;//Negate them ;//Add 32 to the low bits(this is ;//how many we need to move to get aligned) and r5, r2, #0x1C ;//Check only the number of words from count cmp r4, r2 ;//Compare low bits to align against the words from count movhi r4, r5 ;//If words to align is greater than the count, then ;//use the words from count instead cmp r4, #0 beq offby1mainloop ;//r4 now contains the number of times we need to do a word load/store ;//So we need to sortof back-calculate how many of the word load/stores to ;//skip in memcpyoffby1cachelinealignload rsb r6, r4, #32 and r6, r6, #0x1C ;//r3 now contains the number of *words* to skip over. 
;//Deduct words from size sub r2, r2, r4 ;//Because the & 0x1C corresponds to words, we DO need to shift this time around ;//when we jump into load table adr r12, offby1cachelinealignload add pc, r12, r6, LSL #2 ;//Allows 4 instructions per byteblit ;//Because there is no convenient way to split the load/store into multiples of 2 ;//unless we keep them together, for misaligned data we leave them together. offby1cachelinealignload ;//Need to have up to 8 words (1 cache line) ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ;//We are now cache line aligned. ;//We loop around doing prefetches and copies based on how far ahead we want to look offby1mainloop cmp r2, #(32*4 + 32) ;//Only keep looking ahead by 4 cache lines bmi offby1endofmainloop ;//Preload ;// 0x00000264: f5d3f060 `... PLD [r3,#0x60] ;// 0x00000268: f5d3f080 .... 
PLD [r3,#0x80] DCD 0xf5d3f060 DCD 0xf5d3f080 ;//Here is the main loop that handles pipelining the loads for off by 1 ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} orr r1,lr, r4, LSL #24 mov lr, r4, LSR #8 orr r4, lr, r5, LSL #24 mov lr, r5, LSR #8 orr r5, lr, r6, LSL #24 mov lr, r6, LSR #8 orr r6, lr, r7, LSL #24 mov lr, r7, LSR #8 orr r7, lr, r8, LSL #24 mov lr, r8, LSR #8 orr r8, lr, r9, LSL #24 mov lr, r9, LSR #8 orr r9, lr, r10, LSL #24 mov lr, r10, LSR #8 orr r10, lr, r11, LSL #24 mov lr, r11, LSR #8 stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} orr r1,lr, r4, LSL #24 mov lr, r4, LSR #8 orr r4, lr, r5, LSL #24 mov lr, r5, LSR #8 orr r5, lr, r6, LSL #24 mov lr, r6, LSR #8 orr r6, lr, r7, LSL #24 mov lr, r7, LSR #8 orr r7, lr, r8, LSL #24 mov lr, r8, LSR #8 orr r8, lr, r9, LSL #24 mov lr, r9, LSR #8 orr r9, lr, r10, LSL #24 mov lr, r10, LSR #8 orr r10, lr, r11, LSL #24 mov lr, r11, LSR #8 stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} sub r2, r2, #64 ;//Take 64 bytes off of count b offby1mainloop offby1endofmainloop ;//If we still have more than 32*4 words to move, do one more preload cmp r2, #32*4 bls offby1nopreload ;// 0x00000338: f5d3f080 .... PLD [r3,#0x80] DCD 0xf5d3f080 offby1nopreload ;//Now we finish up the copy without any preloads. 
The data should have alread ;//been loaded into the caches ;//Copy 32 bytes at a time offby1finishcachelines cmp r2, #32 bmi offby1endoffinishcachelines ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} orr r1,lr, r4, LSL #24 mov lr, r4, LSR #8 orr r4, lr, r5, LSL #24 mov lr, r5, LSR #8 orr r5, lr, r6, LSL #24 mov lr, r6, LSR #8 orr r6, lr, r7, LSL #24 mov lr, r7, LSR #8 orr r7, lr, r8, LSL #24 mov lr, r8, LSR #8 orr r8, lr, r9, LSL #24 mov lr, r9, LSR #8 orr r9, lr, r10, LSL #24 mov lr, r10, LSR #8 orr r10, lr, r11, LSL #24 mov lr, r11, LSR #8 stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} sub r2, r2, #32 ;//Take 32 bytes off of count b offby1finishcachelines offby1endoffinishcachelines ;//Now we need to finish off any partial cache lines that may be left. We do a similar ;//algorithm to the cachelinealign loop above. ands r6, r2, #0x1C ;//Get number of words left subeq r1, r3, #3 ;//Realign source on exact byte if need to branch beq finish ;//If words left==0, then branch to finish sub r2, r2, r6 ;//Subtract words left from count rsb r6, r6, #32 ;//Get 32-number of words left adr r12, offby1finishload ;//That's the copies to skip add pc, r12, r6, LSL #2 ;//..but need to multiply by 4 to get instructions offby1finishload ;//Need to have up to 8 words (1 cache line) ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 ldr r4, [r3], #4 orr r12,lr, r4, LSL #24 str r12,[r0], #4 mov lr, r4, LSR #8 sub r1, r3, #3 ;//Realign source on exact byte ;//Copy the last 4 bytes, if necessary rsb r2, r2, #4 
;//Find how many bytes to copy (1,2,3, or 4) adr r12, finishloadby1 add pc, r12, r2, LSL #2 ;//Need to shift r2 left by 2 bits to jump instructions finishloadby1 ldrb r3, [r1], #1 ldrb r4, [r1], #1 ldrb r5, [r1], #1 ldrb r6, [r1], #1 adr r12, finishstoreby1 add pc, r12, r2, LSL #2 finishstoreby1 strb r3, [r0], #1 strb r4, [r0], #1 strb r5, [r0], #1 strb r6, [r0], #1 ;//Return to calling function IF Interworking :LOR: Thumbing ldmfd sp!, {r0,r4-r11, lr} bx lr ELSE ldmfd sp!, {r0,r4-r11, pc} ENDIF ;//The source and destination are not aligned. We're going to have to load ;//and shift data from a temporary buffer. Stuff needs to be shifted to the ;//right by 16 bits to align properly memcpyoffby2 ;//First we need to word align the source and r3, r1, #~0x3 ;//Load the first value into the holding buffer (lr) ldr lr, [r3], #4 mov lr, lr, LSR #16 ;//Now we need to align the destination to a cache line boundary ;//We need to figure out how many words are needed to align it. ;//If the number of words to align it are less than the number of words ;//we're asked to copy, just copy the required number of words. and r4, r0, #0x1C ;//Grab the low bits of the destination rsb r4, r4, #32 ;//Negate them ;//Add 32 to the low bits(this is ;//how many we need to move to get aligned) and r5, r2, #0x1C ;//Check only the number of words from count cmp r4, r2 ;//Compare low bits to align against the words from count movhi r4, r5 ;//If words to align is greater than the count, then ;//use the words from count instead cmp r4, #0 beq offby2mainloop ;//r4 now contains the number of times we need to do a word load/store ;//So we need to sortof back-calculate how many of the word load/stores to ;//skip in memcpyoffby2cachelinealignload rsb r6, r4, #32 and r6, r6, #0x1C ;//r3 now contains the number of *words* to skip over. 
;//Deduct words from size sub r2, r2, r4 ;//Because the & 0x1C corresponds to words, we DO need to shift this time around ;//when we jump into load table adr r12, offby2cachelinealignload add pc, r12, r6, LSL #2 ;//Allows 4 instructions per byteblit ;//Because there is no convenient way to split the load/store into multiples of 2 ;//unless we keep them together, for misaligned data we leave them together. offby2cachelinealignload ;//Need to have up to 8 words (1 cache line) ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ;//So in theory we should now be cache line aligned. ;//We loop around doing prefetches and copies based on how far ahead we want to look offby2mainloop cmp r2, #(32*4 + 32) ;//Only keep looking ahead by 4 cache lines bmi offby2endofmainloop ;//Preload ;// 0x00000514: f5d3f060 `... PLD [r3,#0x60] ;// 0x00000518: f5d3f080 .... 
PLD [r3,#0x80] DCD 0xf5d3f060 DCD 0xf5d3f080 ;//Here is the main loop that handles pipelining the loads for off by 2 ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} orr r1,lr, r4, LSL #16 mov lr, r4, LSR #16 orr r4, lr, r5, LSL #16 mov lr, r5, LSR #16 orr r5, lr, r6, LSL #16 mov lr, r6, LSR #16 orr r6, lr, r7, LSL #16 mov lr, r7, LSR #16 orr r7, lr, r8, LSL #16 mov lr, r8, LSR #16 orr r8, lr, r9, LSL #16 mov lr, r9, LSR #16 orr r9, lr, r10, LSL #16 mov lr, r10, LSR #16 orr r10, lr, r11, LSL #16 mov lr, r11, LSR #16 stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} orr r1,lr, r4, LSL #16 mov lr, r4, LSR #16 orr r4, lr, r5, LSL #16 mov lr, r5, LSR #16 orr r5, lr, r6, LSL #16 mov lr, r6, LSR #16 orr r6, lr, r7, LSL #16 mov lr, r7, LSR #16 orr r7, lr, r8, LSL #16 mov lr, r8, LSR #16 orr r8, lr, r9, LSL #16 mov lr, r9, LSR #16 orr r9, lr, r10, LSL #16 mov lr, r10, LSR #16 orr r10, lr, r11, LSL #16 mov lr, r11, LSR #16 stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} sub r2, r2, #64 ;//Take 64 bytes off of count b offby2mainloop offby2endofmainloop ;//If we still have more than 32*4 words to move, do one more preload cmp r2, #32*4 bls offby2nopreload ;// 0x000005e8: f5d3f080 .... PLD [r3,#0x80] DCD 0xf5d3f080 offby2nopreload ;//Now we finish up the copy without any preloads. 
The data should have already ;//been loaded into the caches ;//Copy 32 bytes at a time offby2finishcachelines cmp r2, #32 bmi offby2endoffinishcachelines ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11} orr r1,lr, r4, LSL #16 mov lr, r4, LSR #16 orr r4, lr, r5, LSL #16 mov lr, r5, LSR #16 orr r5, lr, r6, LSL #16 mov lr, r6, LSR #16 orr r6, lr, r7, LSL #16 mov lr, r7, LSR #16 orr r7, lr, r8, LSL #16 mov lr, r8, LSR #16 orr r8, lr, r9, LSL #16 mov lr, r9, LSR #16 orr r9, lr, r10, LSL #16 mov lr, r10, LSR #16 orr r10, lr, r11, LSL #16 mov lr, r11, LSR #16 stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10} sub r2, r2, #32 ;//Take 32 bytes off of count b offby2finishcachelines offby2endoffinishcachelines ;//Now we need to finish off any partial cache lines that may be left. We do a similar ;//algorithm to the cachelinealign loop above. ands r6, r2, #0x1C ;//Get number of words left subeq r1, r3, #2 ;//Realign source on exact byte if need to branch beq finish ;//If words left==0, then branch to finish sub r2, r2, r6 ;//Subtract words left from count rsb r6, r6, #32 ;//Get 32-number of words left adr r12, offby2finishload ;//That's the copies to skip add pc, r12, r6, LSL #2 ;//..but need to multiply by 4 to get instructions offby2finishload ;//Need to have up to 8 words (1 cache line) ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 ldr r4, [r3], #4 orr r12,lr, r4, LSL #16 str r12,[r0], #4 mov lr, r4, LSR #16 sub r1, r3, #2 ;//Realign source on exact byte ;//Copy the last 4 bytes, if necessary 
;//Tail of the preceding byte-copy epilogue; its label and setup lie above this point.
;//r2 = 4 - r2. The result is used twice below as the number of 4-byte
;//instructions to skip when jumping into the ldrb run and then the strb run.
;//NOTE(review): presumably r2 is 0..3 bytes remaining on entry - confirm above.
        rsb r2, r2, #4
        adr r12, finishloadby2
        add pc, r12, r2, LSL #2         ;//pc = finishloadby2 + 4*r2 (skip r2 loads)
finishloadby2
        ldrb r3, [r1], #1
        ldrb r4, [r1], #1
        ldrb r5, [r1], #1
        ldrb r6, [r1], #1
        adr r12, finishstoreby2
        add pc, r12, r2, LSL #2         ;//pc = finishstoreby2 + 4*r2 (skip the matching stores)
finishstoreby2
        strb r3, [r0], #1
        strb r4, [r0], #1
        strb r5, [r0], #1
        strb r6, [r0], #1

;//Return to calling function: pop r0 and r4-r11, then return through lr
;//(interworking/Thumb builds) or by popping straight into pc.
        IF Interworking :LOR: Thumbing
        ldmfd sp!, {r0,r4-r11, lr}
        bx lr
        ELSE
        ldmfd sp!, {r0,r4-r11, pc}
        ENDIF

;//**********************************************************************
;//memcpyoffby3
;//
;//The source and destination are not aligned with each other. Source words
;//are loaded from word-aligned addresses and stitched together through a
;//holding register: each output word is
;//    (byte carried in lr) | (new word << 8)
;//and the top byte of the new word (new word >> 24) becomes the next carry.
;//
;//Register use in this section:
;//    r0      destination pointer (post-incremented by the stores)
;//    r1      source pointer on entry; reused as scratch in the block copies
;//            and rebuilt from r3 before the final byte copy
;//    r2      bytes left to copy
;//    r3      word-aligned read pointer into the source
;//    r4-r11  data / scratch
;//    r12     jump-table base and scratch
;//    lr      holding buffer (one carried byte in bits 0-7)
;//
;//NOTE(review): presumably reached only when the destination is already word
;//aligned and (source & 3) == 3; the dispatch code is not visible here - confirm.
;//**********************************************************************
memcpyoffby3
        ;//Round the source pointer down to a word boundary
        and r3, r1, #~0x3

        ;//Load the first aligned word and keep only its top byte in the
        ;//holding buffer (lr)
        ldr lr, [r3], #4
        mov lr, lr, LSR #24

        ;//Now align the destination to a cache line (32-byte) boundary.
        ;//Work out how many bytes of whole words are needed to get there; if
        ;//that is more than the count, copy only the whole words the count holds.
        and r4, r0, #0x1C               ;//Word-offset bits of the destination within its line
        rsb r4, r4, #32                 ;//r4 = 32 - offset: bytes up to the next line boundary
                                        ;//(32 when the destination is already on a boundary)
        and r5, r2, #0x1C               ;//r5 = whole-word part of count, modulo 32
        cmp r4, r2                      ;//Bytes-to-align versus count (unsigned)
        movhi r4, r5                    ;//If aligning needs more than count, use r5 instead
        cmp r4, #0
        beq offby3mainloop              ;//Nothing to copy word-wise here

        ;//r4 is now the number of bytes (a multiple of 4) to move with single
        ;//word load/stores. Back-calculate how much of the 8-entry table at
        ;//offby3cachelinealignload must be skipped.
        rsb r6, r4, #32
        and r6, r6, #0x1C
        ;//r6 now holds 4 * (number of word copies to skip)

        ;//Deduct the bytes about to be copied from the count
        sub r2, r2, r4

        ;//Each table entry is 4 instructions (16 bytes) and r6 is 4 * entries,
        ;//so shifting r6 left by 2 gives the byte offset into the table.
        adr r12, offby3cachelinealignload
        add pc, r12, r6, LSL #2

;//The load and store for each word are kept together in one table entry so
;//that the table can be entered at any word. Up to 8 words (1 cache line).
;//Every entry: load word, merge with carried byte, store, save new carry.
offby3cachelinealignload
        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

;//In theory the destination is now cache line aligned.
;//Main loop: while at least 32*4 + 32 bytes remain, issue two preloads ahead
;//of the read pointer and copy 64 bytes (two 32-byte batches).
;//NOTE(review): bmi branches on the sign bit of (r2 - 160), so a count with
;//bit 31 set would leave the loop immediately - confirm such counts cannot occur.
offby3mainloop
        cmp r2, #(32*4 + 32)            ;//Only keep looking ahead by 4 cache lines
        bmi offby3endofmainloop

        ;//Preload. The two words below are the hand-assembled encodings of
        ;//    PLD [r3,#0x60]    (0xf5d3f060)
        ;//    PLD [r3,#0x80]    (0xf5d3f080)
        DCD 0xf5d3f060
        DCD 0xf5d3f080

        ;//First 32-byte batch. r1 receives the first output word, so the
        ;//original source pointer in r1 is overwritten from here on.
        ;//(Shifts are LSL #8 / LSR #24: the off-by-3 variant.)
        ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
        orr r1,lr, r4, LSL #8
        mov lr, r4, LSR #24
        orr r4, lr, r5, LSL #8
        mov lr, r5, LSR #24
        orr r5, lr, r6, LSL #8
        mov lr, r6, LSR #24
        orr r6, lr, r7, LSL #8
        mov lr, r7, LSR #24
        orr r7, lr, r8, LSL #8
        mov lr, r8, LSR #24
        orr r8, lr, r9, LSL #8
        mov lr, r9, LSR #24
        orr r9, lr, r10, LSL #8
        mov lr, r10, LSR #24
        orr r10, lr, r11, LSL #8
        mov lr, r11, LSR #24
        stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

        ;//Second 32-byte batch, identical to the first
        ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
        orr r1,lr, r4, LSL #8
        mov lr, r4, LSR #24
        orr r4, lr, r5, LSL #8
        mov lr, r5, LSR #24
        orr r5, lr, r6, LSL #8
        mov lr, r6, LSR #24
        orr r6, lr, r7, LSL #8
        mov lr, r7, LSR #24
        orr r7, lr, r8, LSL #8
        mov lr, r8, LSR #24
        orr r8, lr, r9, LSL #8
        mov lr, r9, LSR #24
        orr r9, lr, r10, LSL #8
        mov lr, r10, LSR #24
        orr r10, lr, r11, LSL #8
        mov lr, r11, LSR #24
        stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

        sub r2, r2, #64                 ;//Take 64 bytes off of count
        b offby3mainloop

offby3endofmainloop
        ;//If more than 32*4 bytes are still left to move, do one more preload
        cmp r2, #32*4
        bls offby3nopreload

        ;//Hand-assembled encoding of PLD [r3,#0x80] (0xf5d3f080)
        DCD 0xf5d3f080

offby3nopreload
;//Now finish up the copy without any preloads. The data should already have
;//been loaded into the caches.
;//Copy 32 bytes at a time while at least 32 bytes remain (same sign-bit test
;//as in the main loop above).
offby3finishcachelines
        cmp r2, #32
        bmi offby3endoffinishcachelines

        ldmia r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
        orr r1,lr, r4, LSL #8
        mov lr, r4, LSR #24
        orr r4, lr, r5, LSL #8
        mov lr, r5, LSR #24
        orr r5, lr, r6, LSL #8
        mov lr, r6, LSR #24
        orr r6, lr, r7, LSL #8
        mov lr, r7, LSR #24
        orr r7, lr, r8, LSL #8
        mov lr, r8, LSR #24
        orr r8, lr, r9, LSL #8
        mov lr, r9, LSR #24
        orr r9, lr, r10, LSL #8
        mov lr, r10, LSR #24
        orr r10, lr, r11, LSL #8
        mov lr, r11, LSR #24
        stmia r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

        sub r2, r2, #32                 ;//Take 32 bytes off of count
        b offby3finishcachelines

offby3endoffinishcachelines
        ;//Finish off any partial cache line that may be left, using the same
        ;//jump-into-a-table scheme as the cache line align step above.
        ands r6, r2, #0x1C              ;//r6 = bytes of whole words left (multiple of 4)
        subeq r1, r3, #1                ;//No whole words: rebuild the exact source byte
                                        ;//pointer (r3 is one byte ahead of it) ...
        beq finish                      ;//... and go straight to the byte copy
        sub r2, r2, r6                  ;//Take the whole-word bytes off of count
        rsb r6, r6, #32                 ;//r6 = 4 * (table entries to skip)
        adr r12, offby3finishload
        add pc, r12, r6, LSL #2         ;//16 bytes of code per entry, hence the shift by 2

;//Up to 8 words (1 cache line); each entry is the same 4-instruction step as
;//in offby3cachelinealignload.
offby3finishload
        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        ldr r4, [r3], #4
        orr r12,lr, r4, LSL #8
        str r12,[r0], #4
        mov lr, r4, LSR #24

        sub r1, r3, #1                  ;//Realign source on exact byte
;//      b finish                        ;//Not needed, just fall through

;//Copy the last bytes, if any. On the paths above r2 is 0..3 here.
;//This finish is also used in the < 4 bytes case.
;//r2 = 4 - r2 is the number of ldrb (and then strb) instructions to skip, so
;//exactly the remaining bytes are moved through the last of r3..r6.
finish
        rsb r2, r2, #4
        adr r12, finishloadby3
        add pc, r12, r2, LSL #2         ;//pc = finishloadby3 + 4*r2 (skip r2 loads)
finishloadby3
        ldrb r3, [r1], #1
        ldrb r4, [r1], #1
        ldrb r5, [r1], #1
        ldrb r6, [r1], #1
        adr r12, finishstoreby3
        add pc, r12, r2, LSL #2         ;//pc = finishstoreby3 + 4*r2 (skip the matching stores)
finishstoreby3
        strb r3, [r0], #1
        strb r4, [r0], #1
        strb r5, [r0], #1
        strb r6, [r0], #1

;//Return to calling function: pop r0 and r4-r11, then return through lr
;//(interworking/Thumb builds) or by popping straight into pc.
;//NOTE(review): presumably the popped r0 is the original dest saved by the
;//prologue, giving memcpy's return value - confirm against the prologue.
        IF Interworking :LOR: Thumbing
        ldmfd sp!, {r0,r4-r11, lr}
        bx lr
        ELSE
        ldmfd sp!, {r0,r4-r11, pc}
        ENDIF

        ENTRY_END memcpybigblk

        END