;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Microsoft Research Singularity
;;; 
;;; Copyright (c) Microsoft Corporation.  All rights reserved.
;;;
;;; This file contains ARM-specific assembly code.
;;;

;**********************************************************************
; void *
; memcpy( void *dest, const void *src, size_t count );
;   The memcpy function copies count bytes of src to dest. 
;   If the source and destination overlap, this function does 
;   not ensure that the original source bytes in the overlapping 
;   region are copied before being overwritten. Use memmove to 
;   handle overlapping regions.
;
;**********************************************************************

        OPT     2       ; disable listing
        INCLUDE kxarm.inc
        OPT     1       ; reenable listing

; Register aliases used by memcpy below.
; "Saved" means the register is pushed by memcpy's prolog and popped again
; on every exit path of memcpy.
dest    RN  R0          ; destination pointer (saved; the original value is popped back into r0 at exit)
source  RN  R1          ; source pointer, post-incremented as data is read
count   RN  R2          ; byte count; the block loops keep it biased below the true remainder
temp1   RN  R3          ; scratch, not saved
temp2   RN  R4          ; scratch, saved
temp3   RN  R5          ; scratch, saved
temp4   RN  R12         ; scratch, not saved

  IF Thumbing
	THUMBAREA
  ENDIF

    NESTED_ENTRY memcpy

	ROUT

  IF Thumbing
; Switch from Thumb mode to ARM mode
; (the two halfwords below are hand-encoded Thumb instructions; their
; mnemonics are given in the trailing comments)
	DCW     0x4778              ; bx pc
	DCW     0x46C0              ; nop
  ENDIF

	;//Save registers onto the stack
	;//  dest is pushed so that every exit path pops the original destination
	;//  pointer back into r0.  temp2, temp3 and lr are pushed because the copy
	;//  code below uses them as scratch registers.
	;//  Four registers = 16 bytes; UNDO_PROLOG adds exactly this back to sp.
    STMDB   sp!, {dest,temp2,temp3,lr}  ; save registers

	PROLOG_END

; Use a threshold to determine which code to use:
;
;   if destination & source are naturally aligned, then
;     threshold = 512
;   else
;     threshold = 128
;
;   if copy size > threshold, then
;     use memcpybigblk
;   else
;     use .NET code
;
; Only temp1 (r3) is written here, so the UNDO_PROLOG path can drop the
; stack frame without reloading any register.

    ORR     temp1, dest, source             ; combine the low bits of both pointers
    TST     temp1, #3                       ; Z set only if both are word aligned
    MOVEQ   temp1, #512                     ; both aligned: threshold = 512
    MOVNE   temp1, #128                     ; otherwise:    threshold = 128
    CMP     count, temp1                    ; BHI below is an unsigned comparison
    BHI     UNDO_PROLOG    ; revert and continue to memcpybigblk

; NOTE: UNDO_PROLOG just restores SP, so do NOT modify anything other
;       than r3 (temp1) and r12 (temp4) before this point

;**********************************************************************
; Small-copy path: reached when count <= threshold.
; Bytes are copied from head to tail, i.e. in ascending address order
; (every load and store below uses post-increment addressing).
; NOTE(review): the original text of this banner was garbled.  A forward
; copy only avoids overwriting unread source bytes when dest starts below
; source; memcpy itself makes no overlap guarantee (see the header comment
; at the top of the file).
;**********************************************************************
HEAD_TO_TAIL
    ;if LT 8 bytes store them and exit
    CMP     count, #8                               ; 2-3 cycles
    BLT     BYTEMOVE4

    ;Check alignment of parameters
    ANDS    temp1, dest, #3                         ; 2-3 cycles
    BEQ     SRCALIGN

    ; destination is at least 1 byte misaligned
    ; Read and write (4 - alignment) bytes to align destination.
    ; temp1 becomes that byte count (1..3).  The first byte is always copied;
    ; the CMP against 2 selects the conditional copies: GE = second byte,
    ; GT = third byte.
    RSB     temp1, temp1, #4                        ; 9 cycles
    LDRB    temp2, [source], #1
    CMP     temp1, #2
    STRB    temp2, [dest],   #1
    LDRGEB  temp3, [source], #1         ; >= 2 == at least 2 bytes
    LDRGTB  temp2, [source], #1         ; > 2 == 3 bytes unaligned
    SUB     count, count, temp1         ; no S suffix: the CMP flags stay valid for the stores
    STRGEB  temp3, [dest],   #1
    STRGTB  temp2, [dest],   #1

; dest is word aligned from here on.  Dispatch on the low two bits of source:
;   bit 0 set            -> UNALIGNED  (source is 1 or 3 bytes past a word boundary)
;   bit 0 clear, bit 1 set -> HWORDMOVE (source is 2 bytes past a word boundary)
;   both clear           -> fall through to WORDMOVE
SRCALIGN                                            ; 3 - 7 cycles
    TST     source, #1                  ; test bit 0 of the source address
    BNE     UNALIGNED                   ; src is odd (1 or 3 bytes off word alignment)
    TST     source, #2
    BNE     HWORDMOVE                   ; src is only halfword aligned; dst is word aligned

;
;word aligned source and destination, move blocks of 32 bytes
;until we have less than 32 bytes left, then divide moves in
;half down to less than 4, where we will move the last 3 or less
;bytes
;
; count is kept biased below the true number of bytes remaining, so that the
; sign of each SUBS/ADDS result says directly whether a block of that size
; still fits.  lr is used as a fourth data register (it was saved by the
; prolog).
;
WORDMOVE
    SUBS    count, count, #32                       ; 2-3 cycles
    BLT     BLK16

; Each iteration moves 32 bytes as two 16-byte LDM/STM pairs.  The SUBS sits
; between the second load and its store; STMIA does not change the flags, so
; BGE still tests the SUBS result.
BLK32                                               ; 20 cycles/32 bytes
    LDMIA   source!, {temp1,temp2,temp3,lr}
    STMIA   dest!,   {temp1,temp2,temp3,lr}
    LDMIA   source!, {temp1,temp2,temp3,lr}
    SUBS    count, count, #32
    STMIA   dest!,   {temp1,temp2,temp3,lr}
    BGE     BLK32

; On entry count = remaining - 32.  After the ADDS, count = remaining - 16.
BLK16                                               ; 11-4 cycles/16 bytes
    ADDS    count, count, #16
    LDMGEIA source!, {temp1, temp2, temp3, lr}      ; >= 16 left: move 16 bytes
    STMGEIA dest!,   {temp1, temp2, temp3, lr}
    BEQ     WORD_BYTES_EXIT                         ; exactly 16 were left: done
    SUBGTS    count, count, #16                     ; 16 were moved: restore the -16 bias

; On entry count = remaining - 16 (whether or not BLK16 moved anything).
; SUBGE has no S suffix, so STMGEIA still sees the flags from the ADDS.
BLK8                                                ; 6 cycles/8 bytes
    ADDS    count, count, #8
    LDMGEIA source!, {temp1, temp2}
    SUBGE   count, count, #8
    STMGEIA dest!,   {temp1, temp2}

; On entry count = remaining - 8.  After the ADDS, count = remaining - 4;
; if a word is moved, that same value is the number of bytes left (0..3).
BLK4
    ADDS    count, count, #4                        ; 6-9 cycles/4 bytes
    LDRGE   temp1, [source], #4
    STRGE   temp1, [dest], #4

; If BLK4 moved nothing (LT), ADDLTS removes the remaining -4 bias.  Either
; way count is now the 0..3 bytes still to copy and Z reflects count == 0.
WORD_BYTES
    ADDLTS  count, count, #4
    BEQ     WORD_BYTES_EXIT                         ; On zero, Return to caller

    ; 1..3 bytes left.  A whole word is loaded from the aligned source and
    ; only the needed low-order bytes are stored: low byte (count == 1), or
    ; low halfword (count >= 2) followed by bits 16-23 (count == 3).
    ; Taking the low-order bits first matches a little-endian word load.
    LDR     temp1, [source], #4                     ; 10 cycles/1-3 bytes
    CMP     count, #2
    STRGEH  temp1, [dest], #2
    STRLTB  temp1, [dest], #1
    MOVGT   temp1, temp1, LSR #16
    STRGTB  temp1, [dest], #1

; Return: pop the original dest back into r0 along with temp2/temp3.
WORD_BYTES_EXIT

  IF Interworking :LOR: Thumbing
    LDMIA   sp!, {dest, temp2, temp3, lr}
    BX      lr
  ELSE
    LDMIA   sp!, {dest, temp2, temp3, pc}
  ENDIF

;
; Source is halfword aligned (2 bytes past a word boundary); destination
; is word aligned.
;
; The leading LDRH brings source up to a word boundary and leaves two
; pending source bytes in bits 0-15 of temp1.  From then on each destination
; word is built as
;       (pending) | (next source word << 16)
; and the new pending value is (next source word >> 16).
; count is biased the same way as in WORDMOVE.
;
HWORDMOVE                                           ; 2-3 cycles
    LDRH    temp1, [source], #2
    SUBS    count, count, #32
    BLT     HWORD8_TST

; 32 bytes per iteration, as two groups of four source words.  temp1 carries
; the pending halfword into each group; lr holds the last word of a group.
HWORD32                                             ; 35 cycles/32 bytes
    LDMIA   source!, {temp2,temp3,temp4,lr}
    ORR     temp1, temp1, temp2, LSL #16
    MOV     temp2, temp2, LSR #16
    ORR     temp2, temp2, temp3, LSL #16
    MOV     temp3, temp3, LSR #16
    ORR     temp3, temp3, temp4, LSL #16
    MOV     temp4, temp4, LSR #16
    ORR     temp4, temp4, lr, LSL #16
    STMIA   dest!, {temp1,temp2,temp3,temp4}    ; Store bytes 1-16
    MOV     temp1, lr, LSR #16                  ; pending halfword for the next group
    LDMIA   source!, {temp2,temp3,temp4,lr}
    ORR     temp1, temp1, temp2, LSL #16
    MOV     temp2, temp2, LSR #16
    ORR     temp2, temp2, temp3, LSL #16
    MOV     temp3, temp3, LSR #16
    ORR     temp3, temp3, temp4, LSL #16
    MOV     temp4, temp4, LSR #16
    ORR     temp4, temp4, lr, LSL #16
    STMIA   dest!, {temp1,temp2,temp3,temp4}    ; Store bytes 17-32
    SUBS    count, count, #32
    MOV     temp1, lr, LSR #16                  ; MOV without S: BGE tests the SUBS
    BGE     HWORD32

; count = remaining - 32 on entry; the ADDS changes the bias to remaining - 8.
HWORD8_TST
    ADDS    count, count, #24
    BLT     HWORD4

HWORD8                                              ; 11 cycles/8 bytes
    LDMIA   source!, {temp2,temp3}
    ORR     temp1, temp1, temp2, LSL #16
    MOV     temp2, temp2, LSR #16
    ORR     temp2, temp2, temp3, LSL #16
    STMIA   dest!, {temp1, temp2}
    SUBS    count, count, #8
    MOV     temp1, temp3, LSR #16
    BGE     HWORD8

; count = remaining - 8 on entry; after the ADDS, count = remaining - 4.
; If one more word is stored, that value is the 0..3 bytes still to copy.
HWORD4                                              ; 3-7 cycles/4 bytes
    ADDS    count, count, #4
    BLT     HWORD_BYTES
    LDR     temp2, [source], #4
    ORR     temp1, temp1, temp2, LSL #16
    STR     temp1, [dest], #4
    MOV     temp1, temp2, LSR #16

; ADDLTS runs only when HWORD4 was skipped and removes the -4 bias; otherwise
; the flags are still those of the ADDS in HWORD4.  Z set means nothing left.
; temp1 holds two pending bytes:
;   count == 1 : store its low byte
;   count >= 2 : store it as a halfword
;   count == 3 : additionally copy one more byte straight from source
HWORD_BYTES                                         ; 5-11 cycles/1-3 bytes
    ADDLTS  count, count, #4
    BEQ     HWORD_BYTES_EXIT                        ; On zero, Return to caller
    CMP     count, #2
    STRLTB  temp1, [dest], #1
    LDRGTB  temp2, [source], #1
    STRGEH  temp1, [dest], #2
    STRGTB  temp2, [dest], #1

; Return: pop the original dest back into r0 along with temp2/temp3.
HWORD_BYTES_EXIT

  IF Interworking :LOR: Thumbing
    LDMIA   sp!, {dest, temp2, temp3, lr}
    BX      lr
  ELSE
    LDMIA   sp!, {dest, temp2, temp3, pc}
  ENDIF

;
; Unaligned Moves
;
; Reached when bit 0 of source is set (destination is word aligned).
; Bit 1 of source then selects the variant:
;   clear -> UNALIGNED1 (source is 1 byte past a word boundary)
;   set   -> UNALIGNED3 (source is 3 bytes past a word boundary), below
;
UNALIGNED
    TST     source, #2
    BEQ     UNALIGNED1

; The leading LDRB brings source up to a word boundary and leaves one pending
; source byte in bits 0-7 of temp1.  Each destination word is then built as
;       (pending) | (next source word << 8)
; and the new pending value is (next source word >> 24).
; count is biased the same way as in WORDMOVE.
UNALIGNED3                                          ; 3-4 cycles
    LDRB    temp1, [source], #1
    SUBS    count, count, #32
    BLT     OFFTHREE8_TST

; 32 bytes per iteration, as two groups of four source words; lr holds the
; last word of each group and supplies the next pending byte.
OFFTHREE32                                          ; 35 cycles/32 bytes
    LDMIA   source!, {temp2,temp3,temp4,lr}
    ORR     temp1, temp1, temp2, LSL #8
    MOV     temp2, temp2, LSR #24
    ORR     temp2, temp2, temp3, LSL #8
    MOV     temp3, temp3, LSR #24
    ORR     temp3, temp3, temp4, LSL #8
    MOV     temp4, temp4, LSR #24
    ORR     temp4, temp4, lr, LSL #8
    STMIA   dest!, {temp1,temp2,temp3,temp4}    ; Store bytes 1-16
    MOV     temp1, lr, LSR #24                  ; pending byte for the next group
    LDMIA   source!, {temp2,temp3,temp4,lr}
    ORR     temp1, temp1, temp2, LSL #8
    MOV     temp2, temp2, LSR #24
    ORR     temp2, temp2, temp3, LSL #8
    MOV     temp3, temp3, LSR #24
    ORR     temp3, temp3, temp4, LSL #8
    MOV     temp4, temp4, LSR #24
    ORR     temp4, temp4, lr, LSL #8
    STMIA   dest!, {temp1,temp2,temp3,temp4}    ; Store bytes 17-32
    SUBS    count, count, #32
    MOV     temp1, lr, LSR #24                  ; MOV without S: BGE tests the SUBS
    BGE     OFFTHREE32

; count = remaining - 32 on entry; the ADDS changes the bias to remaining - 8.
OFFTHREE8_TST
    ADDS    count, count, #24
    BLT     OFFTHREE4

OFFTHREE8                                           ; 11 cycles/8 bytes
    LDMIA   source!, {temp2,temp3}
    ORR     temp1, temp1, temp2, LSL #8
    MOV     temp2, temp2, LSR #24
    ORR     temp2, temp2, temp3, LSL #8
    STMIA   dest!, {temp1, temp2}
    SUBS    count, count, #8
    MOV     temp1, temp3, LSR #24
    BGE     OFFTHREE8

; count = remaining - 8 on entry; after the ADDS, count = remaining - 4.
OFFTHREE4                                           ; 3-7 cycles/4 bytes
    ADDS    count, count, #4
    BLT     OFFTHREE_BYTES
    LDR     temp2, [source], #4
    ORR     temp1, temp1, temp2, LSL #8
    STR     temp1, [dest], #4
    MOV     temp1, temp2, LSR #24

; ADDLTS runs only when OFFTHREE4 was skipped and removes the -4 bias;
; otherwise the flags are still those of the ADDS in OFFTHREE4.
; temp1 holds one pending byte, which is always stored first:
;   count >= 2 : a halfword is loaded from the (word-aligned) source and its
;                low byte stored as the second byte
;   count == 3 : bits 8-15 of that halfword are stored as the third byte
OFFTHREE_BYTES                                      ; 5-12 cycles/ 1-3 bytes
    ADDLTS  count, count, #4
    BEQ     OFFTHREE_EXIT                           ; On zero, Return to caller
    CMP     count, #2
    LDRGEH  temp2, [source], #2
    STRB    temp1, [dest], #1
    STRGEB  temp2, [dest], #1
    MOVGT   temp2, temp2, LSR #8
    STRGTB  temp2, [dest], #1

; Return: pop the original dest back into r0 along with temp2/temp3.
OFFTHREE_EXIT

  IF Interworking :LOR: Thumbing
    LDMIA   sp!, {dest, temp2, temp3, lr}
    BX      lr
  ELSE
    LDMIA   sp!, {dest, temp2, temp3, pc}           ; Return to caller
  ENDIF

;
; Source is one byte from word alignment.
;   Read a byte & half word then multiple words and a byte.  Then
;   shift and ORR them into consecutive words for STM writes
;
; The LDRB + LDRH pair brings source up to a word boundary and leaves three
; pending source bytes in bits 0-23 of temp1.  Each destination word is then
; built as
;       (pending) | (next source word << 24)
; and the new pending value is (next source word >> 8).
; count is biased the same way as in WORDMOVE.
UNALIGNED1                                          ; 5-6 cycles
    LDRB    temp1, [source], #1
    LDRH    temp2, [source], #2
    SUBS    count, count, #32
    ORR     temp1, temp1, temp2, LSL #8             ; ORR without S: BLT tests the SUBS
    BLT     OFFONE8_TST

; 32 bytes per iteration, as two groups of four source words; lr holds the
; last word of each group and supplies the next three pending bytes.
OFFONE32                                            ; 35 cycles/32 bytes
    LDMIA   source!, {temp2, temp3, temp4, lr}
    ORR     temp1, temp1, temp2, LSL #24
    MOV     temp2, temp2, LSR #8
    ORR     temp2, temp2, temp3, LSL #24
    MOV     temp3, temp3, LSR #8
    ORR     temp3, temp3, temp4, LSL #24
    MOV     temp4, temp4, LSR #8
    ORR     temp4, temp4, lr, LSL #24
    STMIA   dest!, {temp1,temp2,temp3,temp4}    ; Store bytes 1-16
    MOV     temp1, lr, LSR #8                   ; pending bytes for the next group
    LDMIA   source!, {temp2,temp3,temp4,lr}
    ORR     temp1, temp1, temp2, LSL #24
    MOV     temp2, temp2, LSR #8
    ORR     temp2, temp2, temp3, LSL #24
    MOV     temp3, temp3, LSR #8
    ORR     temp3, temp3, temp4, LSL #24
    MOV     temp4, temp4, LSR #8
    ORR     temp4, temp4, lr, LSL #24
    STMIA   dest!, {temp1,temp2,temp3,temp4}    ; Store bytes 17-32
    SUBS    count, count, #32
    MOV     temp1, lr, LSR #8                   ; MOV without S: BGE tests the SUBS
    BGE     OFFONE32

; count = remaining - 32 on entry; the ADDS changes the bias to remaining - 8.
OFFONE8_TST
    ADDS    count, count, #24
    BLT     OFFONE4

OFFONE8                                             ; 11 cycles/8 bytes
    LDMIA   source!, {temp2,temp3}
    ORR     temp1, temp1, temp2, LSL #24
    MOV     temp2, temp2, LSR #8
    ORR     temp2, temp2, temp3, LSL #24
    STMIA   dest!, {temp1,temp2}
    SUBS    count, count, #8
    MOV     temp1, temp3, LSR #8
    BGE     OFFONE8

; count = remaining - 8 on entry; after the ADDS, count = remaining - 4.
; The BEQ after the store still sees the ADDS flags: if exactly four bytes
; were left, the copy is complete.
OFFONE4                                             ; 3-9 cycles/4 bytes
    ADDS    count, count, #4
    BLT     OFFONE_BYTES
    LDR     temp2, [source], #4
    ORR     temp1, temp1, temp2, LSL #24
    STR     temp1, [dest], #4
    BEQ     OFFONE_EXIT
    MOV     temp1, temp2, LSR #8

; ADDLTS runs only when OFFONE4 was skipped and removes the -4 bias.
; temp1 holds three pending bytes, so no further source read is needed:
;   count == 1 : store the low byte
;   count >= 2 : store the low halfword
;   count == 3 : additionally store bits 16-23
OFFONE_BYTES                                        ; 11 cycles/1-3 bytes
    ADDLTS  count, count, #4
    BEQ     OFFONE_EXIT
    CMP     count, #2
    STRLTB  temp1, [dest], #1
    STRGEH  temp1, [dest], #2
    MOVGT   temp1, temp1, LSR #16
    STRGTB  temp1, [dest], #1

; Return: pop the original dest back into r0 along with temp2/temp3.
OFFONE_EXIT

  IF Interworking :LOR: Thumbing
    LDMIA   sp!, {dest, temp2, temp3, lr}
    BX      lr
  ELSE
    LDMIA   sp!, {dest, temp2, temp3, pc}           ; Return to caller
  ENDIF
    
; Short-copy path: entered from HEAD_TO_TAIL when count < 8.  No alignment
; is assumed; everything is moved with byte loads and stores.
; If at least four bytes remain, move four of them here (lr is used as the
; fourth data register; it was saved by the prolog).
BYTEMOVE4                                           ; 12 cycles/4 bytes
    CMP     count, #4
    BLT     MMOVEXIT
    LDRB    temp1, [source], #1
    SUB     count, count, #4
    LDRB    temp2, [source], #1
    LDRB    temp3, [source], #1
    LDRB    lr, [source], #1
    STRB    temp1, [dest], #1
    STRB    temp2, [dest], #1
    STRB    temp3, [dest], #1
    STRB    lr, [dest], #1

; count is now 0..3.  Return immediately if nothing is left; otherwise fall
; through to BYTEMOVE.
MMOVEXIT                                            ; 2-5 cycles
    CMP     count, #0
  IF Interworking :LOR: Thumbing
    LDMEQIA sp!, {dest, temp2, temp3, lr}
    BXEQ    lr
  ELSE
    LDMEQIA sp!, {dest, temp2, temp3, pc}           ; On zero, Return to caller
  ENDIF

;
; Store the last 1-3 bytes and exit
;
; The first byte is always copied.  The CMP against 2 then decides the rest:
; count == 1 leaves via BLT, count >= 2 copies a second byte, and count == 3
; (GT) copies a third.
;
BYTEMOVE                                            ; 4-7 cycles/1 byte
    LDRB    temp1, [source], #1
    CMP     count, #2
    STRB    temp1, [dest], #1
    BLT     BYTEMOVE_EXIT
    LDRGEB  temp2, [source], #1                     ; 8 cycles/1-2 bytes
    LDRGTB  temp3, [source], #1
    STRGEB  temp2, [dest], #1
    STRGTB  temp3, [dest], #1

; Return: pop the original dest back into r0 along with temp2/temp3.
BYTEMOVE_EXIT

  IF Interworking :LOR: Thumbing
    LDMIA   sp!, {dest, temp2, temp3, lr}
    BX      lr
  ELSE
    LDMIA   sp!, {dest, temp2, temp3, pc}           ; Return to caller
  ENDIF


; THIS IS NOT A RETURN
; The following reverts the stack to its state at the point of entry
; of memcpy.  It then falls through to memcpybigblk to perform the
; large copy
;
; 0x10 = the 16 bytes (four registers) pushed by memcpy's prolog.  The saved
; values are discarded rather than reloaded: the only branch to this label is
; taken before anything other than temp1 (r3) has been modified, so r0-r2,
; r4, r5 and lr still hold their entry values.
UNDO_PROLOG
	ADD     sp, sp, #0x10
;
; FALLTHRU
;
    ENTRY_END memcpy


    NESTED_ENTRY memcpybigblk

	ROUT

	;//Save registers onto the stack
	;//R3 should be OK to destroy.  If not, we stack it off too.
	stmfd 	sp!, {r0,r4-r11, lr}

	PROLOG_END

prefetch_setup
;//Prefetch the source.
;//Have to align source register with word boundary first
	mov 	r5, r1
	and		r5, r5, #~0x3
	
;//The PLD instruction just happens to be a Never Execute on ARM V4,
;//so we can in-line the PLD instruction and still maintain V4 compatibility
;//        0x0000000c:    f5d5f000    ....    PLD      [r5,#0]
;//        0x00000010:    f5d5f020     ...    PLD      [r5,#0x20]
;//        0x00000014:    f5d5f040    @...    PLD      [r5,#0x40]
	DCD		0xf5d5f000
	DCD		0xf5d5f020
	DCD		0xf5d5f040
	
;//If there are 4 or less bytes to copy, we just jump to the end
;//and do a straight byte copy.
	cmp 	r2, #4
	bls 	finish

;//Align the destination to a word boundary.
	rsb		r4, r0, #0				;//Figure out how many bytes
	ands	r4, r4, #0x2			;//See if we need to do 2 copies
	ldrneb 	r5, [r1], #1			;//Read the two bytes
	ldrneb	r6, [r1], #1
	subne	r2, r2, #2				;//Decrement count by 2
	strneb	r5, [r0], #1			;//Now store the two bytes
	strneb  r6, [r0], #1			;//Have to do two seperate byte stores 
									;//because of possible address misalignment

	ands 	r4, r0, #0x1			;//See if we need to do 1 copy
	ldrneb	r5, [r1], #1			;//Load the single byte
	subne	r2, r2, #1				;//Decrement count by 1
	strneb	r5, [r0], #1			;//Store the single byte

;//We need to choose which memcpy we use based
;//on how the source is now aligned.  If the destination and source
;//are both aligned, then we fall through to the aligned copy

;//Check the byte alignment of the source
;//We do it in reverse order just because.  If most memcopies are
;//expected to be off by a certain #, that should be placed first.
	and		r3, r1, #3
	cmp 	r3, #3						;//If both bits are set, go do case 3, off by 3 bytes
	beq		memcpyoffby3				;//Goto case 3
	cmp		r3, #2						;//Check for case 2, off by 2 bytes
	beq		memcpyoffby2				;//Goto case 2
	cmp		r3, #1						;//Check for case 1, off by 1 byte
	beq		memcpyoffby1				;//Goto case 1

;//The source and destination are word aligned.  We get an easy job.
memcpyoffby0

;//Now we need to align the destination to a cache line boundary
;//We need to figure out how many words are needed to align it.
;//If the number of words to align it are less than the number of words
;//we're asked to copy, just copy the required number of words.
	and		r4, r0, #0x1C				;//Grab the low bits of the destination
	rsb		r4, r4, #32					;//Negate them and
										;//add 32 to the low bits(this is  
										;//how many we need to move to get aligned)
 	and		r5, r2, #0x1C				;//Check only the number of words from count
	cmp		r4, r2	 					;//Compare low bits to align against the words from count
	movhi	r4, r5						;//If words to align is greater than the count, then
										;//use the words from count instead

	cmp		r4, #0
	beq		offby0mainloop

;//r4 now contains the number of times we need to do a word load/store
;//So we need to sortof back-calculate how many of the word load/stores to
;//skip in memcpyoffby0cachelinealignload/store
	rsb 	r3, r4, #32
	and		r3, r3, #0x1C
;//r3 now contains the number of *instructions* to skip over.

;//Deduct words from size
	sub 	r2, r2, r4
		
;//Because the & 0x1C corresponds to words, we don't have to shift anything
;//when we jump into load table
;//Using two jump tables is faster because it gives the processor a chance to load
;//data before we try to store it out.
	adr 	r12, offby0cachelinealignload
	add 	pc, r12, r3

offby0cachelinealignload				;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r1], #4				;//Could also do load/store pairs, and shift
	ldr 	r5, [r1], #4				;//r3 left 1 bit to calculate jump address
	ldr 	r6, [r1], #4
	ldr 	r7, [r1], #4
	ldr 	r8, [r1], #4
	ldr 	r9, [r1], #4
	ldr 	r10,[r1], #4
	ldr 	r11,[r1], #4

;//Now jump into the store table
	adr 	r12, offby0cachelinealignstore
	add		pc, r12, r3

offby0cachelinealignstore
	str		r4, [r0], #4
	str		r5, [r0], #4
	str		r6, [r0], #4
	str		r7, [r0], #4
	str		r8, [r0], #4
	str		r9, [r0], #4
	str		r10,[r0], #4
	str		r11,[r0], #4

;//We are now cache line aligned. 
;//We loop around doing prefetches and copies based on how far ahead we want to look

offby0mainloop
	cmp 	r2, #(32*3 + 32)			;//Only keep looking ahead by 4 cache lines
	bmi 	offby0endofmainloop
	
;//Preload the data
;//        0x000000f4:    f5d1f060    `...    PLD      [r1,#0x60]
;//        0x000000f8:    f5d1f080    ....    PLD      [r1,#0x80]

	DCD		0xf5d1f060
	DCD		0xf5d1f080

;//Here is the main loop that handles pipelining the loads
	
	ldmia	r1!, {r4-r11}
	stmia	r0!, {r4-r11}

	ldmia	r1!, {r4-r11}
	stmia	r0!, {r4-r11}
	
	sub		r2, r2, #64					;//Take 64 bytes off of count
	
	b 		offby0mainloop
	
offby0endofmainloop	
;//If we still have more than 32*4 bytes to move, do one more preload
	cmp		r2, #32*4
	bls		offby0nopreload
;//        0x0000011c:    f5d1f080    ....    PLD      [r1,#0x80]
	DCD		0xf5d1f080
	
offby0nopreload

;//Now we finish up the copy without any preloads.  The data should have already
;//been loaded into the caches
;//Copy 32 bytes at a time
offby0finishcachelines
	cmp 	r2, #32
	bmi		offby0endoffinishcachelines

	ldmia	r1!, {r4-r11}
	stmia	r0!, {r4-r11}
	
	sub		r2, r2, #32					;//Take 32 bytes off of count
	b		offby0finishcachelines

offby0endoffinishcachelines

;//Now we need to finish off any partial cache lines that may be left. We do a similar
;//algorithm to the cachelinealign loop above.
	ands	r3, r2, #0x1C				;//Get number of words left
	beq		finish						;//If words left==0, then branch to finish
	sub		r2, r2, r3					;//Subtract words left from count
	rsb		r3, r3, #32					;//Get 32-number of words left
	
	adr		r12, offby0finishload		;//That's the instructions to skip
	add 	pc, r12, r3

offby0finishload						;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r1], #4				;//Could also do load/store pairs, and shift
	ldr 	r5, [r1], #4				;//r3 left 1 bit to calculate jump address
	ldr 	r6, [r1], #4
	ldr 	r7, [r1], #4
	ldr 	r8, [r1], #4
	ldr 	r9, [r1], #4
	ldr 	r10,[r1], #4
	ldr 	r11,[r1], #4

;//Now jump into the store table
	adr 	r12, offby0finishstore
	add		pc, r12, r3

offby0finishstore
	str		r4, [r0], #4
	str		r5, [r0], #4
	str		r6, [r0], #4
	str		r7, [r0], #4
	str		r8, [r0], #4
	str		r9, [r0], #4
	str		r10,[r0], #4
	str		r11,[r0], #4

;//Copy the last 4 bytes, if necessary
	rsb		r2, r2, #4					;//Find how many bytes to copy (0, 1,2,3, or 4)
	adr 	r12, finishloadby0
	add		pc, r12, r2, LSL #2			;//Need to shift r2 left by 2 bits to jump instructions

finishloadby0
	ldrb		r3, [r1], #1
	ldrb		r4, [r1], #1 
	ldrb		r5, [r1], #1
	ldrb		r6, [r1], #1
	
	adr		r12, finishstoreby0
	add		pc, r12, r2, LSL #2

finishstoreby0
	strb		r3, [r0], #1
	strb		r4, [r0], #1
	strb		r5, [r0], #1
	strb		r6, [r0], #1

;//Return to calling function
    IF Interworking :LOR: Thumbing
        ldmfd sp!, {r0,r4-r11, lr}
        bx      lr
    ELSE
        ldmfd sp!, {r0,r4-r11, pc}
    ENDIF


;//The source and destination are not aligned.  We're going to have 
;//to load and shift data from a temporary buffer.  Stuff needs to be 
;//shifted to the right by 8 bits to align properly
memcpyoffby1

;//First we need to word align the source
	and 	r3, r1, #~0x3
;//Load the first value into the holding buffer (lr)
	ldr		lr, [r3], #4
	mov		lr, lr, LSR #8

;//Now we need to align the destination to a cache line boundary
;//We need to figure out how many words are needed to align it.
;//If the number of words to align it are less than the number of words
;//we're asked to copy, just copy the required number of words.
	and		r4, r0, #0x1C				;//Grab the low bits of the destination
	rsb		r4, r4, #32					;//Negate them
										;//Add 32 to the low bits(this is  
										;//how many we need to move to get aligned)
 	and		r5, r2, #0x1C				;//Check only the number of words from count
	cmp		r4, r2	 					;//Compare low bits to align against the words from count
	movhi	r4, r5						;//If words to align is greater than the count, then
										;//use the words from count instead

	cmp		r4, #0
	beq		offby1mainloop
;//r4 now contains the number of times we need to do a word load/store
;//So we need to sortof back-calculate how many of the word load/stores to
;//skip in memcpyoffby1cachelinealignload
	rsb 	r6, r4, #32
	and		r6, r6, #0x1C
;//r6 now contains 4 * (the number of word copies to skip over); r3 is the aligned source pointer.

;//Deduct words from size
	sub 	r2, r2, r4
		
;//Because the & 0x1C corresponds to words, we DO need to shift this time around
;//when we jump into load table
	adr 	r12, offby1cachelinealignload
	add 	pc, r12, r6, LSL #2			;//Allows 4 instructions per byteblit

;//Because there is no convenient way to split the load/store into multiples of 2
;//unless we keep them together, for misaligned data we leave them together.
offby1cachelinealignload				;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
;//We are now cache line aligned.
;//We loop around doing prefetches and copies based on how far ahead we want to look
offby1mainloop
	cmp 	r2, #(32*4 + 32)			;//Only keep looking ahead by 4 cache lines
	bmi 	offby1endofmainloop
	
;//Preload		
;//        0x00000264:    f5d3f060    `...    PLD      [r3,#0x60]
;//        0x00000268:    f5d3f080    ....    PLD      [r3,#0x80]
	DCD		0xf5d3f060
	DCD		0xf5d3f080
	
;//Here is the main loop that handles pipelining the loads for off by 1
	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
	
	orr		r1,lr, r4, LSL #24
	mov		lr, r4, LSR #8
	
	orr		r4, lr, r5, LSL #24
	mov		lr, r5, LSR #8

	orr		r5, lr, r6, LSL #24
	mov		lr, r6, LSR #8

	orr		r6, lr, r7, LSL #24
	mov		lr, r7, LSR #8

	orr		r7, lr, r8, LSL #24
	mov		lr, r8, LSR #8

	orr		r8, lr, r9, LSL #24
	mov		lr, r9, LSR #8

	orr		r9, lr, r10, LSL #24
	mov		lr, r10, LSR #8

	orr		r10, lr, r11, LSL #24
	mov		lr, r11, LSR #8

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}

	orr		r1,lr, r4, LSL #24
	mov		lr, r4, LSR #8
	
	orr		r4, lr, r5, LSL #24
	mov		lr, r5, LSR #8

	orr		r5, lr, r6, LSL #24
	mov		lr, r6, LSR #8

	orr		r6, lr, r7, LSL #24
	mov		lr, r7, LSR #8

	orr		r7, lr, r8, LSL #24
	mov		lr, r8, LSR #8

	orr		r8, lr, r9, LSL #24
	mov		lr, r9, LSR #8

	orr		r9, lr, r10, LSL #24
	mov		lr, r10, LSR #8

	orr		r10, lr, r11, LSL #24
	mov		lr, r11, LSR #8

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

	sub		r2, r2, #64					;//Take 64 bytes off of count

	b 		offby1mainloop

offby1endofmainloop	
;//If we still have more than 32*4 bytes to move, do one more preload
	cmp		r2, #32*4
	bls		offby1nopreload
;//        0x00000338:    f5d3f080    ....    PLD      [r3,#0x80]
	DCD		0xf5d3f080

offby1nopreload

;//Now we finish up the copy without any preloads.  The data should have already
;//been loaded into the caches
;//Copy 32 bytes at a time
offby1finishcachelines
	cmp 	r2, #32
	bmi		offby1endoffinishcachelines
	
	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
	
	orr		r1,lr, r4, LSL #24
	mov		lr, r4, LSR #8
	
	orr		r4, lr, r5, LSL #24
	mov		lr, r5, LSR #8

	orr		r5, lr, r6, LSL #24
	mov		lr, r6, LSR #8

	orr		r6, lr, r7, LSL #24
	mov		lr, r7, LSR #8

	orr		r7, lr, r8, LSL #24
	mov		lr, r8, LSR #8

	orr		r8, lr, r9, LSL #24
	mov		lr, r9, LSR #8

	orr		r9, lr, r10, LSL #24
	mov		lr, r10, LSR #8

	orr		r10, lr, r11, LSL #24
	mov		lr, r11, LSR #8

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}
		
	sub		r2, r2, #32					;//Take 32 bytes off of count
	b		offby1finishcachelines

offby1endoffinishcachelines

;//Now we need to finish off any partial cache lines that may be left. We do a similar
;//algorithm to the cachelinealign loop above.
	ands	r6, r2, #0x1C				;//Get number of words left
	subeq	r1, r3, #3					;//Realign source on exact byte if need to branch
	beq		finish						;//If words left==0, then branch to finish
	sub		r2, r2, r6					;//Subtract words left from count
	rsb		r6, r6, #32					;//Get 32-number of words left
	
	adr		r12, offby1finishload		;//That's the copies to skip
	add 	pc, r12, r6, LSL #2			;//..but need to multiply by 4 to get instructions

offby1finishload				;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #24
	str 	r12,[r0], #4
	mov		lr, r4, LSR #8

	sub		r1, r3, #3					;//Realign source on exact byte
	
;//Copy the last 4 bytes, if necessary
	rsb		r2, r2, #4					;//Find how many bytes to copy (1,2,3, or 4)
	adr 	r12, finishloadby1
	add		pc, r12, r2, LSL #2			;//Need to shift r2 left by 2 bits to jump instructions

finishloadby1
	ldrb		r3, [r1], #1
	ldrb		r4, [r1], #1 
	ldrb		r5, [r1], #1
	ldrb		r6, [r1], #1
	
	adr		r12, finishstoreby1
	add		pc, r12, r2, LSL #2
	
finishstoreby1
	strb		r3, [r0], #1
	strb		r4, [r0], #1
	strb		r5, [r0], #1
	strb		r6, [r0], #1

;//Return to calling function
    IF Interworking :LOR: Thumbing
        ldmfd sp!, {r0,r4-r11, lr}
        bx      lr
    ELSE
        ldmfd sp!, {r0,r4-r11, pc}
    ENDIF
	
;//The source and destination are not aligned.  We're going to have to load
;//and shift data from a temporary buffer.  Stuff needs to be shifted to the
;//right by 16 bits to align properly
memcpyoffby2

;//First we need to word align the source
	and 	r3, r1, #~0x3
;//Load the first value into the holding buffer (lr)
	ldr		lr, [r3], #4
	mov		lr, lr, LSR #16

;//Now we need to align the destination to a cache line boundary
;//We need to figure out how many words are needed to align it.
;//If the number of words to align it are less than the number of words
;//we're asked to copy, just copy the required number of words.
	and		r4, r0, #0x1C				;//Grab the low bits of the destination
	rsb		r4, r4, #32					;//Negate them
										;//Add 32 to the low bits(this is  
										;//how many we need to move to get aligned)
 	and		r5, r2, #0x1C				;//Check only the number of words from count
	cmp		r4, r2	 					;//Compare low bits to align against the words from count
	movhi	r4, r5						;//If words to align is greater than the count, then
										;//use the words from count instead

	cmp		r4, #0
	beq		offby2mainloop

;//r4 now contains the number of times we need to do a word load/store
;//So we need to sortof back-calculate how many of the word load/stores to
;//skip in memcpyoffby2cachelinealignload
	rsb 	r6, r4, #32
	and		r6, r6, #0x1C
;//r6 now contains 4 * (the number of word copies to skip over); r3 is the aligned source pointer.

;//Deduct words from size
	sub 	r2, r2, r4
		
;//Because the & 0x1C corresponds to words, we DO need to shift this time around
;//when we jump into load table
	adr 	r12, offby2cachelinealignload
	add 	pc, r12, r6, LSL #2			;//Allows 4 instructions per byteblit

;//Because there is no convenient way to split the load/store into multiples of 2
;//unless we keep them together, for misaligned data we leave them together.
offby2cachelinealignload				;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
;//So in theory we should now be cache line aligned.
;//We loop around doing prefetches and copies based on how far ahead we want to look
offby2mainloop
	cmp 	r2, #(32*4 + 32)			;//Only keep looking ahead by 4 cache lines
	bmi 	offby2endofmainloop
	
;//Preload
;//        0x00000514:    f5d3f060    `...    PLD      [r3,#0x60]
;//        0x00000518:    f5d3f080    ....    PLD      [r3,#0x80]
	DCD		0xf5d3f060
	DCD		0xf5d3f080

;//Here is the main loop that handles pipelining the loads for off by 2
	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
	
	orr		r1,lr, r4, LSL #16
	mov		lr, r4, LSR #16
	
	orr		r4, lr, r5, LSL #16
	mov		lr, r5, LSR #16

	orr		r5, lr, r6, LSL #16
	mov		lr, r6, LSR #16

	orr		r6, lr, r7, LSL #16
	mov		lr, r7, LSR #16

	orr		r7, lr, r8, LSL #16
	mov		lr, r8, LSR #16

	orr		r8, lr, r9, LSL #16
	mov		lr, r9, LSR #16

	orr		r9, lr, r10, LSL #16
	mov		lr, r10, LSR #16

	orr		r10, lr, r11, LSL #16
	mov		lr, r11, LSR #16

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}

	orr		r1,lr, r4, LSL #16
	mov		lr, r4, LSR #16
	
	orr		r4, lr, r5, LSL #16
	mov		lr, r5, LSR #16

	orr		r5, lr, r6, LSL #16
	mov		lr, r6, LSR #16

	orr		r6, lr, r7, LSL #16
	mov		lr, r7, LSR #16

	orr		r7, lr, r8, LSL #16
	mov		lr, r8, LSR #16

	orr		r8, lr, r9, LSL #16
	mov		lr, r9, LSR #16

	orr		r9, lr, r10, LSL #16
	mov		lr, r10, LSR #16

	orr		r10, lr, r11, LSL #16
	mov		lr, r11, LSR #16

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

	sub		r2, r2, #64					;//Take 64 bytes off of count
	b 		offby2mainloop
	
offby2endofmainloop	
;//If we still have more than 32*4 bytes to move, do one more preload
	cmp		r2, #32*4
	bls		offby2nopreload
;//        0x000005e8:    f5d3f080    ....    PLD      [r3,#0x80]
	DCD		0xf5d3f080

offby2nopreload

;//Now we finish up the copy without any preloads.  The data should have already
;//been loaded into the caches
;//Copy 32 bytes at a time
offby2finishcachelines
	cmp 	r2, #32
	bmi		offby2endoffinishcachelines

	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}

	orr		r1,lr, r4, LSL #16
	mov		lr, r4, LSR #16
	
	orr		r4, lr, r5, LSL #16
	mov		lr, r5, LSR #16

	orr		r5, lr, r6, LSL #16
	mov		lr, r6, LSR #16

	orr		r6, lr, r7, LSL #16
	mov		lr, r7, LSR #16

	orr		r7, lr, r8, LSL #16
	mov		lr, r8, LSR #16

	orr		r8, lr, r9, LSL #16
	mov		lr, r9, LSR #16

	orr		r9, lr, r10, LSL #16
	mov		lr, r10, LSR #16

	orr		r10, lr, r11, LSL #16
	mov		lr, r11, LSR #16

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}
		
	sub		r2, r2, #32					;//Take 32 bytes off of count
	b		offby2finishcachelines

offby2endoffinishcachelines

;//Now we need to finish off any partial cache lines that may be left. We do a similar
;//algorithm to the cachelinealign loop above.
	ands		r6, r2, #0x1C			;//Get number of words left
	subeq	r1, r3, #2					;//Realign source on exact byte if need to branch
	beq		finish						;//If words left==0, then branch to finish
	sub		r2, r2, r6					;//Subtract words left from count
	rsb		r6, r6, #32					;//Get 32-number of words left
	
	adr		r12, offby2finishload		;//That's the copies to skip
	add 	pc, r12, r6, LSL #2			;//..but need to multiply by 4 to get instructions

offby2finishload				;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #16
	str 	r12,[r0], #4
	mov		lr, r4, LSR #16

	sub		r1, r3, #2					;//Realign source on exact byte
	
;//Copy the last 4 bytes, if necessary
	rsb		r2, r2, #4					;//Find how many bytes to copy (1,2,3, or 4)
	adr 	r12, finishloadby2
	add		pc, r12, r2, LSL #2			;//Need to shift r2 left by 2 bits to jump instructions

finishloadby2
	ldrb		r3, [r1], #1
	ldrb		r4, [r1], #1 
	ldrb		r5, [r1], #1
	ldrb		r6, [r1], #1
	
	adr		r12, finishstoreby2
	add		pc, r12, r2, LSL #2
	
finishstoreby2
	strb		r3, [r0], #1
	strb		r4, [r0], #1
	strb		r5, [r0], #1
	strb		r6, [r0], #1
		
;//Return to calling function
    IF Interworking :LOR: Thumbing
        ldmfd sp!, {r0,r4-r11, lr}
        bx      lr
    ELSE
        ldmfd sp!, {r0,r4-r11, pc}
    ENDIF

;//memcpyoffby3: the source and destination are not mutually word aligned, so
;//every destination word is assembled from two consecutive aligned source
;//words: the top byte of the previous word (kept in lr after LSR #24) and the
;//low three bytes of the next word (merged in with LSL #8).
;//NOTE(review): presumably reached when (source & 3) == 3 with a word-aligned
;//destination; the dispatch is outside this section -- confirm.
;//NOTE(review): the shift directions presumably assume little-endian -- confirm.
;//Register use in this path:
;//  r0 = destination pointer (advanced by the word stores)
;//  r1 = source pointer on entry; recomputed from r3 before the byte tail
;//  r2 = remaining byte count
;//  r3 = word-aligned source read pointer, always one word ahead
;//  lr = holding buffer (carried byte in bits 7..0)
memcpyoffby3

;//First we need to word align the source (round r1 down to a multiple of 4)
	and 	r3, r1, #~0x3
;//Load the first value into the holding buffer (lr) and keep only its top
;//byte, moved down to bits 7..0; r3 now points at the next source word
	ldr		lr, [r3], #4
	mov		lr, lr, LSR #24
	

;//Now we need to align the destination to a cache line (32 byte) boundary.
;//We need to figure out how many word copies are needed to align it.
;//If that is more than the count we were asked to copy, copy only the
;//whole words available in the count instead.
	and		r4, r0, #0x1C				;//Grab the low bits (word offset within 32 bytes) of the destination
	rsb		r4, r4, #32					;//r4 = 32 - low bits
										;//= bytes to copy to reach the next 32 byte
										;//boundary (32 if already on a boundary)
 	and		r5, r2, #0x1C				;//r5 = bits 2..4 of count (bytes in whole words, mod 32)
	cmp		r4, r2	 					;//Compare bytes-to-align against the byte count (unsigned)
	movhi	r4, r5						;//If bytes-to-align is greater than the count, then
										;//use the whole-word bytes from count instead

;//r4 == 0 means there are no word copies to do before the main loop
	cmp		r4, #0
	beq		offby3mainloop

;//r4 now contains the number of bytes (a multiple of 4) to copy with word
;//load/stores.  Back-calculate how much of the 8-entry table at
;//offby3cachelinealignload to skip so that exactly r4/4 entries execute.
	rsb 	r6, r4, #32
	and		r6, r6, #0x1C
;//r6 now contains the number of bytes (4 per word copy) to skip over.

;//Deduct the bytes about to be copied from the count
	sub 	r2, r2, r4
		
;//r6 counts 4 per word copy and each table entry is 4 instructions (16 bytes),
;//so r6 must be shifted left by 2 when we jump into the load table.
;//The table must start immediately after the add below.
	adr 	r12, offby3cachelinealignload
	add 	pc, r12, r6, LSL #2			;//Each table entry is 4 instructions per word copied

;//Computed-jump table of 8 identical word copies (up to one 32 byte cache line).
;//The dispatch above enters part-way in so that only the required number of
;//entries run.  Every entry MUST stay exactly 4 instructions long, because the
;//entry address is computed as table + 16 * entries-to-skip.
;//Each entry:
;//  ldr  - fetch the next aligned source word into r4 and advance r3
;//  orr  - r12 = carried byte (lr, bits 7..0) | low 3 bytes of r4 shifted up 8
;//  str  - write the assembled word to the destination and advance r0
;//  mov  - carry the top byte of r4 in lr for the next entry
;//Because there is no convenient way to split the load/store into multiples of 2
;//unless we keep them together, for misaligned data we leave them together.
offby3cachelinealignload				;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
;//So in theory the destination should now be cache line aligned.
;//Main loop: while at least 32*4 + 32 = 160 bytes remain, issue two preloads
;//ahead of the source read pointer and copy 64 bytes (two groups of 8 words).
;//NOTE(review): bmi tests the sign of (r2 - 160), so a count with bit 31 set
;//would leave the loop immediately -- confirm callers never pass such sizes.
offby3mainloop
	cmp 	r2, #(32*4 + 32)			;//Only keep looking ahead by 4 cache lines
	bmi 	offby3endofmainloop
	
;//Preload, emitted as raw opcodes with DCD (presumably because the assembler
;//in use does not accept the PLD mnemonic -- confirm)
;//        0x000007c4:    f5d3f060    `...    PLD      [r3,#0x60]
;//        0x000007c8:    f5d3f080    ....    PLD      [r3,#0x80]
	DCD		0xf5d3f060
	DCD		0xf5d3f080

;//Here is the main loop that handles pipelining the loads for off by 3.
;//Load 8 aligned source words, then build 8 destination words in place:
;//each result is the carried byte in lr OR'ed with the next word shifted left
;//by 8, and the top byte of that word becomes the new carry.  r1 is used as
;//scratch for the first result (the source pointer is recomputed from r3
;//before the byte tail), and each later result overwrites the register whose
;//value was consumed by the previous step.
	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
	
	orr		r1,lr, r4, LSL #8
	mov		lr, r4, LSR #24

	orr		r4, lr, r5, LSL #8
	mov		lr, r5, LSR #24

	orr		r5, lr, r6, LSL #8
	mov		lr, r6, LSR #24

	orr		r6, lr, r7, LSL #8
	mov		lr, r7, LSR #24

	orr		r7, lr, r8, LSL #8
	mov		lr, r8, LSR #24

	orr		r8, lr, r9, LSL #8
	mov		lr, r9, LSR #24

	orr		r9, lr, r10, LSL #8
	mov		lr, r10, LSR #24

	orr		r10, lr, r11, LSL #8
	mov		lr, r11, LSR #24

;//Store the 8 assembled words (32 bytes) to the destination
	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

;//Second group of 8 words, identical to the first
	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}

	orr		r1,lr, r4, LSL #8
	mov		lr, r4, LSR #24
	
	orr		r4, lr, r5, LSL #8
	mov		lr, r5, LSR #24

	orr		r5, lr, r6, LSL #8
	mov		lr, r6, LSR #24

	orr		r6, lr, r7, LSL #8
	mov		lr, r7, LSR #24

	orr		r7, lr, r8, LSL #8
	mov		lr, r8, LSR #24

	orr		r8, lr, r9, LSL #8
	mov		lr, r9, LSR #24

	orr		r9, lr, r10, LSL #8
	mov		lr, r10, LSR #24

	orr		r10, lr, r11, LSL #8
	mov		lr, r11, LSR #24

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}

	sub		r2, r2, #64					;//Take 64 bytes off of count
	b 		offby3mainloop
	
offby3endofmainloop	
;//If we still have more than 32*4 bytes to move, do one more preload
;//(bls is an unsigned "lower or same", so the preload is skipped for <= 128)
	cmp		r2, #32*4
	bls		offby3nopreload
;//        0x00000898:    f5d3f080    ....    PLD      [r3,#0x80]
	DCD		0xf5d3f080

offby3nopreload

;//Now we finish up the copy without any preloads.  The data should have already
;//been loaded into the caches
;//Copy 32 bytes (one group of 8 words) at a time while at least 32 bytes remain
offby3finishcachelines
	cmp 	r2, #32
	bmi		offby3endoffinishcachelines
	
;//Load 8 aligned source words and merge each with the byte carried in lr,
;//exactly as in the main loop; r1 is scratch for the first assembled word
	ldmia	r3!, {r4, r5, r6, r7, r8, r9, r10, r11}

	orr		r1,lr, r4, LSL #8
	mov		lr, r4, LSR #24
	
	orr		r4, lr, r5, LSL #8
	mov		lr, r5, LSR #24

	orr		r5, lr, r6, LSL #8
	mov		lr, r6, LSR #24

	orr		r6, lr, r7, LSL #8
	mov		lr, r7, LSR #24

	orr		r7, lr, r8, LSL #8
	mov		lr, r8, LSR #24

	orr		r8, lr, r9, LSL #8
	mov		lr, r9, LSR #24

	orr		r9, lr, r10, LSL #8
	mov		lr, r10, LSR #24

	orr		r10, lr, r11, LSL #8
	mov		lr, r11, LSR #24

	stmia	r0!, {r1, r4, r5, r6, r7, r8, r9, r10}
		
	sub		r2, r2, #32					;//Take 32 bytes off of count
	b		offby3finishcachelines

offby3endoffinishcachelines

;//Now we need to finish off any partial cache lines that may be left. We do a similar
;//algorithm to the cachelinealign loop above: jump part-way into a table of 8
;//word copies so that only the remaining whole words are copied.
	ands		r6, r2, #0x1C			;//r6 = bytes in whole words left (bits 2..4 of count)
	subeq	r1, r3, #1					;//Realign source on exact byte if need to branch
	beq		finish						;//If words left==0, then branch to finish
	sub		r2, r2, r6					;//Subtract the whole-word bytes from count
	rsb		r6, r6, #32					;//r6 = 32 - whole-word bytes = bytes of copies to skip
	
	adr		r12, offby3finishload		;//Base of the table; r6 selects the entry point
	add 	pc, r12, r6, LSL #2			;//..but need to multiply by 4 to get instructions

;//Each entry below must stay exactly 4 instructions (16 bytes) long; the
;//computed jump above relies on it.  An entry loads the next aligned source
;//word, merges it with the byte carried in lr, stores the result and carries
;//the new top byte.
offby3finishload						;//Need to have up to 8 words (1 cache line)
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24
		
	ldr 	r4, [r3], #4
	orr		r12,lr, r4, LSL #8
	str 	r12,[r0], #4
	mov		lr, r4, LSR #24

;//r3 is one word ahead and one byte of that word is still only held in lr,
;//so the next uncopied source byte is at r3 - 1
	sub		r1, r3, #1					;//Realign source on exact byte
	
;//	b finish							;//Not needed, just fall through

;//Copy the last (up to 4) bytes, if necessary.
;//On entry r1 = address of the next source byte, r0 = next destination byte,
;//r2 = bytes still to copy.  Two computed jumps skip (4 - r2) byte loads and
;//then the same number of byte stores, so only the trailing r2 loads and the
;//matching stores (same registers) execute.
finish									;//This finish also used in < 4 bytes case
	rsb		r2, r2, #4					;//r2 = 4 - bytes left = number of byte copies to skip
	adr 	r12, finishloadby3
	add		pc, r12, r2, LSL #2			;//Need to shift r2 left by 2 bits to jump instructions

;//Four byte loads; the jump above enters after the skipped ones
finishloadby3
	ldrb		r3, [r1], #1
	ldrb		r4, [r1], #1 
	ldrb		r5, [r1], #1
	ldrb		r6, [r1], #1
	
;//Skip the same number of stores so each executed store uses a loaded register
	adr		r12, finishstoreby3
	add		pc, r12, r2, LSL #2
	
finishstoreby3
	strb		r3, [r0], #1
	strb		r4, [r0], #1
	strb		r5, [r0], #1
	strb		r6, [r0], #1
		
;//Return to calling function: restore r0 and r4-r11 from the stack
;//(r0 is presumably the original destination saved by the prologue, i.e. the
;//memcpy return value -- the prologue is outside this section, confirm).
;//When interworking, restore lr and return with bx; otherwise load the return
;//address straight into pc.
    IF Interworking :LOR: Thumbing
	ldmfd sp!, {r0,r4-r11, lr}
	bx      lr
    ELSE
	ldmfd sp!, {r0,r4-r11, pc}
    ENDIF


    ENTRY_END memcpybigblk
	
    END