singrdk/base/Kernel/Native/arm/Crt/r_addd.asm

635 lines
22 KiB
NASM
Raw Permalink Normal View History

2008-11-17 18:29:00 -05:00
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Microsoft Research Singularity
;;;
;;; Copyright (c) Microsoft Corporation. All rights reserved.
;;;
;;; This file contains ARM-specific assembly code.
;;;
; Live part of this file: declare the assembly-time logical variable add_s
; (GBLL initializes it to {FALSE}) and include the shared double-precision
; veneer source.
; NOTE(review): presumably veneer_d.asm selects the add/subtract entry points
; by testing whether add_s is defined -- confirm in veneer_d.asm.
GBLL add_s
GET veneer_d.asm
; END stops assembly at this point, so the SH3-derived __addd/__subd routine
; that follows is kept for reference only and is never assembled.
END
;;;; THE BELOW ROUTINE SHOULD WORK, BUT THE ARM ROUTINES SHOULD BE FASTER.
;
; Translated to ARM from SH3 FP emulation routines.
;
; __addd Double precision floating point addition.
; Input:
; r0 - Arg1.low
; r1 - Arg1.high
; r2 - Arg2.low
; r3 - Arg2.high
; Output:
; r0 - Result.low
; r1 - Result.high
;
; Note:
; If any FP exceptions are enabled, this routine may raise an exception.
;
;
; IEEE DOUBLE FORMAT
;
; 8 BYTES (LONG WORD * 2)
;  63 62        52 51                                                 0
; +-+-----------+----------------------------------------------------+
; |s|   e(11)   |                       m(52)                        |
; +-+-----------+----------------------------------------------------+
;               ^ point
;
; INFINITY NUMBER : e = 2047 m = 0
; ZERO : e = 0 m = 0
; NaN : e = 2047 m != 0
; DENORMAL NUMBER : e = 0 m != 0
;
; Shared definitions, exported entry points and mantissa-position constants.
; NOTE(review): _FpAddD/_FpSubD, OVF_bit/INX_bit/IVO_bit, FPECause_mask,
; dSignalBit, CALL, Thumbing and Interworking are used below but not defined in
; this file; presumably they come from fpe.asm -- confirm.
        GET     fpe.asm
        EXPORT  __addd
        EXPORT  __subd
        IMPORT  FPE_Raise
        AREA    |.text|, CODE, READONLY
; The constants below are positions in the HIGH mantissa word.
; CARRY_CHECK: with the mantissa shifted left 3 for G,R,S, a high word at or
;              above this value means the sum/rounding carried past the hidden
;              bit.
; MSB:         hidden-bit position once the mantissa is shifted left 3.
; NORMAL:      hidden-bit position in the unshifted mantissa (used while
;              normalizing denormals).
CARRY_CHECK EQU 0x01000000
MSB         EQU 0x00800000
NORMAL      EQU 0x00100000
; Note: the SEH prolog below must match the SEH prolog for __addd.
;
; __subd: double-precision subtract, performed as Arg1 + (-Arg2).
;   In:  r1:r0 = Arg1 (high:low), r3:r2 = Arg2 (high:low)
;   Out: branches to add_in with parameter1 = R8:R0, parameter2 = R2:R4
;        (sign bit of parameter2 toggled) and R5 = _FpSubD.
__subd
        STMFD   sp!, {r0-r10, lr}       ; Save off args and non-volatiles and lr
        MOV     r5, #_FpSubD            ; Double subtract, assume no exceptions
        MOV     r8, r1                  ; parameter1 = R8:R0
        MOV     r4, r2                  ; parameter2.low -> R4
        EOR     r2, r3, #0x80000000     ; parameter2.high -> R2, sign bit toggled
        B       add_in                  ; Then go add
; Note: the SEH prolog below must match the SEH prolog for __subd
;
; __addd: double-precision add.
;   In:  r1:r0 = Arg1 (high:low), r3:r2 = Arg2 (high:low)
;   Out: falls through into add_in with parameter1 = R8:R0,
;        parameter2 = R2:R4 and R5 = _FpAddD.
__addd
        STMFD   sp!, {r0-r10, lr}       ; Save off args and non-volatiles and lr
        MOV     r5, #_FpAddD            ; Double add, assume no exceptions
        MOV     r8, r1                  ; parameter1 = R8:R0
        MOV     r4, r2                  ; parameter2.low -> R4
        MOV     r2, r3                  ; parameter2.high -> R2
add_in
; Common add path: __subd branches here after toggling parameter2's sign and
; __addd falls through. On entry parameter1 = R8:R0 (high:low) and
; parameter2 = R2:R4 (high:low).
;
; If abs(parameter1) < abs(parameter2) then swap them so that the resulting
; parameter1 has the larger magnitude. This guarantees that only parameter2
; might need to be shifted right before adding. Because of denormal numbers,
; it's not sufficient to compare only the exponents; the entire mantissa must
; be checked as well.
;
; if ((abs(parameter1).hi < abs(parameter2).hi) ||
;     ((abs(parameter1).hi == abs(parameter2).hi) &&
;      (parameter1.lo < parameter2.lo)))
;     swap parameter1 and parameter2
MOV r3, r8, LSL #1 ; r3 = abs(param1).hi (sign bit shifted out)
CMP r3, r2, LSL #1 ; Unsigned compare with abs(param2).hi
CMPEQ r0, r4 ; High words equal: the low words decide
BHS end_swap ; abs(param1) >= abs(param2): already ordered
; Otherwise abs(param1) < abs(param2): fall through and swap.
swap
MOV r3,r8 ; Swap the high words through r3
MOV r8,r2 ; ..
MOV r2,r3 ; ..
MOV r3,r0 ; Swap the low words through r3
MOV r0,r4 ; ..
MOV r4,r3 ; ..
end_swap
; Unpack parameters.
;
;   R8 R0: mantissa1      R2 R4: mantissa2
;   R9:    exponent1      R1:    exponent2
;   R10:   sign1          R6:    sign2
;
;   R5: Exception flags
;
; R10 and R6 keep the whole original high word; only bit 31 of each is
; consulted later.
MOV r9, r8, LSL #1 ; Extract exponent1: drop the sign bit ...
MOV r9, r9, LSR #21 ; ... and right-justify the 11 exponent bits
MOV r1, r2, LSL #1 ; Extract exponent2
MOV r1, r1, LSR #21 ; ...
MVN r3, #0 ; r3 = 0xFFFFFFFF; (r3 LSR #12) = 0x000FFFFF mantissa mask
MOV r10, r8 ; Extract sign1 (bit 31)
MOV r6, r2 ; Extract sign2 (bit 31)
AND r8, r8, r3, LSR #12; Extract mantissa1 high (20 bits, no hidden bit)
AND r2, r2, r3, LSR #12; Extract mantissa2 high (20 bits, no hidden bit)
; Check for exceptional cases. All NaNs, infinities, and 0's are eliminated.
; Denormal numbers return here after normalizing them. After these checks,
; both parameters are normalized numbers.
;
; After potentially swapping the parameters above, it's sufficient to test
; just parameter1 for non-finite values (NaN, inf) to eliminate non-finite
; values in either parameter. Similarly, it's sufficient to test just
; parameter2 for the unnormalized numbers (exponent2 = 0; denormals and 0).
;
; if (exponent1 == 2047)
;     exception1; parameter1 is nonfinite, parameter2 might be too
; if (exponent2 == 0)
;     exception2; parameter2 is 0 or denormal, parameter1 might be too
;
; The test is written as (exponent1 + 1) == 2048 because 2047 (0x7FF) cannot
; be encoded as an ARM data-processing immediate, while 2048 can.
ADD r3, r9, #1 ; if (exponent1==2047)
CMP r3, #2048 ; ...
BEQ exception1 ; exception1
CMP r1, #0 ; if (exponent2==0)
BEQ exception2 ; exception2
exception_return2
; Both operands are normal numbers here; exception2 re-enters at this label
; after normalizing denormals. Shift each 64-bit mantissa left 3 bits to make
; room for the guard, round and sticky bits (G,R,S), then set its hidden bit,
; which now sits at MSB in the high word.
        MOV     r8, r8, LSL #3          ; mantissa1 <<= 3 (high word)
        ORR     r8, r8, r0, LSR #29     ; bring in the top 3 bits of the low word
        ORR     r8, r8, #MSB            ; set mantissa1's hidden bit
        MOV     r0, r0, LSL #3          ; mantissa1 <<= 3 (low word)
        MOV     r2, r2, LSL #3          ; mantissa2 <<= 3 (high word)
        ORR     r2, r2, r4, LSR #29     ; bring in the top 3 bits of the low word
        ORR     r2, r2, #MSB            ; set mantissa2's hidden bit
        MOV     r4, r4, LSL #3          ; mantissa2 <<= 3 (low word)
; Scale parameter2 so that its exponent matches that of parameter1, preparing
; for the addition. Because of the swap earlier, parameter2 always scales by
; shifting right (if it shifts at all). Bits shifted out of the bottom are
; folded into the sticky bit S (bit 0 of the low word).
;
; shift = exponent1 - exponent2          // always >= 0
; if (shift == 0)
;     // nothing to do
; else if (shift <= 3)
;     // only the (zero) G,R,S positions are shifted out; plain shift
; else if (shift > 55)
;     // entire mantissa2 shifts into the sticky bit; just set S
; else
;     if (shift >= 32)
;         // "shift" 32 by moving high word to low word; shift -= 32
;     if (shift != 0)
;         // shift the remainder by dynamic shifting
scale
SUBS r1, r9, r1 ; shift = exponent1 - exponent2
BEQ scale_end ; Shift == 0?
CMP r1, #3 ;**;
BLE scale_le_3 ;**; 0 < shift <= 3? ..
CMP r1, #55 ; If shift <= 55 then
BLE scale_le_55 ; ..
MOV R2, #0 ; Else (shift > 55) mantissa2 = 0 with only S set
MOV R4, #1 ; ..
B scale_end
scale_le_3 ;**; No bits are ever lost (low 3 bits are still zero)
MOV r4, r4, LSR r1 ; mantissa2 >>= x where 0 < x <= 3
RSB r3, r1, #32 ; r3 = 32 - shift
ORR r4, r4, r2, LSL r3 ; Low word receives the bits leaving the high word
MOV r2, r2, LSR r1 ; Shift the high word
B scale_end
scale_le_55 ; Else 3 < shift <= 55
CMP r1, #31 ; If shift < 32
BLE scale_le_31 ; ..
CMP r4, #0 ; Else S = mantissa2.l != 0 (NE held until the ORRNE)
SUB r1, r1, #32 ; (32 fewer bits to shift)
MOV r4, r2 ; Shift 32 bits by moving
MOV r2, #0 ; ..
ORRNE r4, r4, #1 ; Set S if shifted out bits
scale_le_31
CMP r1, #0 ; If shift != 0
BEQ scale_end ; ..
RSB r3, r1, #32 ; Get 32 - shift
MOVS r7, r4, LSL r3 ; Extract low mantissa shifted out (Sticky==NE)
MOV r7, r2, LSL r3 ; Extract high mantissa shifted into lower
MOV r2, r2, LSR r1 ; Shift high mantissa into position
MOV r4, r4, LSR r1 ; Shift low mantissa into position
ORR r4, r4, r7 ; Insert bits from high mantissa into low
ORRNE r4, r4, #1 ; Set sticky if shifted out bits (flags from the MOVS)
scale_end
; Add the mantissas.
;
; if (sign1 == sign2)
;     result = mantissa1 + mantissa2    // Same signs => addition
;     if (result carried)
;         scale result right one place, folding the lost bit into S
;         if (exponent overflowed)
;             return properly signed inf
; else if (mantissa1 == mantissa2)
;     return +0                         // Equal magnitudes => result = +0
; else
;     result = mantissa1 - mantissa2    // Opposite signs => subtraction
;     scale result left                 // High-order bits were lost
EORS r7, r10, r6 ; If sign1 != sign2 (bit 31 of the XOR is set)
BMI mantissa_sub ; do subtract
ADDS r0, r0, r4 ; Else result = mantissa1 + mantissa2
ADC r8, r8, r2 ; ..
CMP r8, #CARRY_CHECK; If the result did not carry
BLT end_calc ; then it is still normalized: done
MOVS r8, r8, LSR #1 ; Else scale right one
MOVS r0, r0, RRX ; ..
ORRCS r0, r0, #1 ; (fold lost bit into S)
ADD r9, r9, #1 ; Add 1 to exponent for shift
ADD r3, r9, #1 ; Add 1 to exponent for compare
CMP r3, #2048 ; EQ if overflow (exponent reached 2047)
BLT end_calc ; No overflow: go round and pack
; Overflowed so
ORR r5, r5, #OVF_bit :OR: INX_bit
; set exception flags
MOV r0, #0 ; and return properly signed inf
MOV r8, #0 ; (exponent == 2047, mantissa == 0; sign from r10)
B return_value ; ..
; Return +0: the operands have opposite signs and equal magnitudes.
; The result is handed to `return` in r1:r0. The high word is cleared
; explicitly instead of relying on r1 still holding a zero shift count from
; the scale step.
plus_zero
        MOV     r8, #0                  ; Clear the working mantissa
        MOV     r0, #0                  ; Result.low  = 0
        MOV     r1, #0                  ; Result.high = 0 (sign clear => +0)
        B       return                  ; Skip rounding and packing
; Opposite signs: subtract the scaled mantissa2 (R2:R4) from mantissa1
; (R8:R0). Equal magnitudes are diverted to plus_zero, so the difference
; computed below is never zero.
mantissa_sub
CMP r8, r2 ; Else if mantissa1 = mantissa2
CMPEQ r0, r4 ; ..
BEQ plus_zero ; return +0
man_sub1
SUBS r0, r0, r4 ; Else result = mantissa1 - mantissa2
SBC r8, r8, r2 ; (borrow propagates into the high word)
;**; Parameter1 always has the larger magnitude; result is always its sign.
; Normalize since high-order bits are lost when subtracting. Do this in
; chunks: by 32, then by 16 (which may overshoot), then one bit at a time in
; whichever direction is needed until the leading 1 sits at MSB.
; The exponent in r9 is adjusted with every shift.
normalize
CMP r8, #0 ; If mantissa.h = 0
BNE norm32_end ; ..
MOV r8, r0 ; mantissa <<= 32 by moving
MOV r0, #0 ; ..
SUB r9, r9, #32 ; exponent -= 32
norm32_end
MVN r3, #0 ; If (mantissa.h & 0xffff0000) = 0
TST r8, r3, LSL #16 ; ..
BNE norm16_end ;
MOV r8, r8, LSL #16 ; mantissa <<= 16
ORR r8, r8, r0, LSR #16 ; (top half of low word moves into high word)
MOV r0, r0, LSL #16 ; ..
SUB r9, r9, #16 ; exponent -= 16
norm16_end
CMP r8, #CARRY_CHECK ; If mantissa is not too far left
BLO overnorm_end ; go check for too far right; otherwise, undo
over_norm_loop
MOVS r8, r8, LSR #1 ; mantissa1 >>= 1
MOV r0, r0, RRX ; ..
ADD r9, r9, #1 ; exponent1++
CMP r8, #CARRY_CHECK ; If mantissa is still too far left
BHS over_norm_loop ; ..
B end_norm ; Done
overnorm_end
CMP r8, #MSB ; If the leading 1 is already at MSB
BGE end_norm ; then the mantissa is normalized
norm_loop
MOVS r0, r0, LSL #1 ; mantissa1 <<= 1
MOV r8, r8, LSL #1 ; ..
ORRCS r8, r8, #1 ; (carry out of the low word enters the high word)
SUB r9, r9, #1 ; exponent1--
CMP r8, #MSB ; If mantissa is still too far right
BLT norm_loop ; ..
end_norm
end_calc
; Denormalize the result if necessary, with no concern for performance.
; Addition (and thus subtraction) can never generate less significant bits than
; those of the original operands. Thus, denormalization never results in lost
; bits to fold into S.
;
; When exponent <= 0 the mantissa is shifted right (1 - exponent) places; the
; countdown leaves r9 == 0, the exponent field value used for a denormal.
CMP r9, #0 ; If exponent <= 0
BGT end_denormal ; (exponent > 0: result is normal, nothing to do)
RSB r9, r9, #0 ; Then shift right -exponent places
ADD r9, r9, #1 ; +1 for the non-hidden bit
denormal_loop
MOVS r8, r8, LSR #1 ; mantissa >>= 1
MOV r0, r0, RRX ; ..
SUBS r9, r9, #1 ; count down; ends with r9 == 0
BNE denormal_loop ; ..
end_denormal
; Round to nearest. If rounding occurs, set inexact and
; mantissa += G & ( L | R | S ). If the rounding carries, then renormalize.
;
; Bit positions in the low mantissa word:
;   L = 0x8 (least significant result bit), G = 0x4, R = 0x2, S = 0x1
;
; Addition (and thus subtraction) can never generate less significant bits than
; those of the original operands. Thus, rounding can never meet either of the
; IEEE loss of accuracy tests for underflow. Nor can rounding cause MaxDenorm
; to carry to MinNormal.
;
; Test for inexact.
TST r0, #0x7 ; If G|R|S (=> rounding required)
BEQ end_round ; ..
ORR r5, r5, #INX_bit; result is inexact (can't underflow)
; Round to nearest.
TST r0, #0x4 ; If G &&
BEQ end_round ; ..
TST r0, #0xB ; L|R|S (ties round up only when L is set)
BEQ end_round ; ..
ADDS r0, r0, #0x8 ; Then round the mantissa up (add 1 at L)
ADC r8, r8, #0 ;
CMP r8, #CARRY_CHECK; If the rounding carried
BLT end_round ; (mantissa >= 0x01000000)
ADD r9, r9, #2 ; Then renormalize: exponent + 1, plus 1 more for the compare
CMP r9, #2048 ; If rounding caused overflow (new exponent == 2047)
SUB r9, r9, #1 ; Undo the compare bias; no S suffix, so flags are kept
ORREQ r5, r5, #OVF_bit :OR: INX_bit
; Report overflow (=> inexact)
end_round
; Pack the result back into IEEE format.
;   In:  R8:R0 = mantissa (still shifted left 3 for G,R,S), R9 = exponent,
;        R10 bit 31 = sign
;   Out: r1:r0 = packed double (high:low); falls through into `return`.
return_value
MOV r0, r0, LSR #3 ; Shift mantissa right 3
ORR r0, r0, r8, LSL #29 ; ..
MOV r1, r8, LSR #3 ; ..
BIC r1, r1, #0x0FF00000
; Mask away the hidden bit and possibly one bit
; higher if round incremented mantissa.
; 0xFF<<20 is probably overkill, but safe.
ORR r1, r1, r9, LSL #20
; Merge exponent and mantissa
AND r10, r10, #0x80000000
ORR r1, r1, r10 ; Merge sign with exponent and mantissa
; Common exit. On entry r1:r0 holds the default result and r5 the exception
; information accumulated so far.
;
; if (r5 & FPECause_mask)
;     call FPE_Raise with the original operands and the default result
;     extract the possibly updated result
; return
return
        TST     r5, #FPECause_mask      ; If any exceptions occurred ...
        BEQ     done
        ;;
        ;; Register usage:
        ;;   r0 - Default result.low
        ;;   r1 - Default result.high
        ;;   r5 - Exception information
        ;;
        ;; Stack:
        ;;   0x10(sp) - up: Saved registers
        ;;   0xC(sp): Original Arg2.high
        ;;   0x8(sp): Original Arg2.low
        ;;   0x4(sp): Original Arg1.high
        ;;   0x0(sp): Original Arg1.low
        ;;
        LDR     r2, [sp, #0x8]          ; Load original Arg2.low
        LDR     r3, [sp, #0xC]          ; Load original Arg2.high
        SUB     sp, sp, #0x8            ; Make room for exception information
        STR     r2, [sp, #0x0]          ; Store original Arg2.low
        STR     r3, [sp, #0x4]          ; Store original Arg2.high
        ; Arg1 goes in r2 (low) / r3 (high), as stated in the register usage
        ; below and matching the low-then-high order used for Arg2 and the
        ; result. The previous code loaded these two registers the other way
        ; round.
        ; NOTE(review): FPE_Raise is defined elsewhere -- confirm it expects
        ; arg1.low in r2 and arg1.high in r3.
        LDR     r2, [sp, #0x8]          ; Load original Arg1.low
        LDR     r3, [sp, #0xC]          ; Load original Arg1.high
        STR     r0, [sp, #0x8]          ; Store default result.low
        STR     r1, [sp, #0xC]          ; Store default result.high
        MOV     r1, r5                  ; Move exception information
        ADD     r0, sp, #0x10           ; Pointer for return value
        ;; Register Usage:
        ;;   r0 - Address for return value = 0x10(sp)
        ;;   r1 - Exception information
        ;;   r2 - Original arg1.low
        ;;   r3 - Original arg1.high
        ;;
        ;; Stack Usage:
        ;;   0x14(sp): Return result.high
        ;;   0x10(sp): Return result.low
        ;;   0xC(sp): Default result.high
        ;;   0x8(sp): Default result.low
        ;;   0x4(sp): Original arg2.high
        ;;   0x0(sp): Original arg2.low
        CALL    FPE_Raise               ; Deal with exception information
        IF Thumbing :LAND: :LNOT: Interworking
        CODE16
        bx      pc                      ; switch back to ARM mode
        nop
        CODE32
        ENDIF
        LDR     r0, [sp, #0x10]         ; Load up returned result
        LDR     r1, [sp, #0x14]         ; ...
        ADD     sp, sp, #0x8            ; Restore extra arg passing space
done
        ADD     sp, sp, #0x10           ; Pop off original args
        IF Interworking :LOR: Thumbing
        LDMIA   sp!, {r4-r10, lr}
        BX      lr
        ELSE
        LDMIA   sp!, {r4-r10, pc}
        ENDIF
; Restore off non-volatiles and return
;%%%%%%%%%%%%%%%%%%%%%%%%%
;% Exceptional process %
;%%%%%%%%%%%%%%%%%%%%%%%%%
; Exception 1: parameter1 is non-finite (exponent1 == 2047). The mantissa has
; not been shifted left for the guard bits yet. The choice of ARM SNaN
; versus QNaN (mantissa<51> = 1 => QNaN) means that abs(<any QNaN>) >
; abs(<any SNaN>) > abs(<any inf>).
;
; Every path below builds the result directly in r1:r0 and branches to
; `return`, bypassing rounding and packing.
;
; exception1:
;   if (mantissa1 == 0)
;     CheckArg2INF(); // Arg1 is an INF. Must check Arg2 for INF.
;   else if (mantissa1[MSb] == 0)
;     SignalInvalid(); // Arg1 is an SNaN so signal invalid and return it.
;   else
;     CheckArg2SNaN(); // Arg1 is a QNaN. Check Arg2 for SNaN.
;
; CheckArg2SNaN:
;   if (exponent2 == 2047 &&
;       mantissa2 != 0 &&
;       mantissa2[MSb] == 0)
;     SignalInvalid();
;   else
;     ReturnQNaN();
;
; CheckArg2INF:
;   if (exponent2 == 2047 &&
;       mantissa2 == 0)
;     if (sign1 ^ sign2)
;       SignalInvalid(); // Arg1 and Arg2 are opposite INFs.
;     else
;       ReturnINF(); // Arg1 and Arg2 are same signed INFs.
;   else
;     ReturnINF(); // Arg1 is INF. Arg2 is not.
;
; SignalInvalid:
;   cause |= INVALID_OPERATION;
;   ReturnQNaN();
;
; ReturnQNaN:
;   exponent1 = 2047;
;   mantissa1[MSb] = 1;
;   return();
;
; ReturnINF
;   exponent1 = 2047;
;   mantissa1 = 0;
;   return();
;
exception1
ORRS r3, r8, r0 ; if (mantissa1 == 0)
BEQ CheckArg2INF ; CheckArg2INF
TST r8, #dSignalBit ; else if (mantissa1[MSb] == 0)
BEQ SignalInvalid ; SignalInvalid
; else
; CheckArg2SNaN
CheckArg2SNaN
ADD r3, r1, #1 ; if (exponent2 == 2047 &&
CMP r3, #2048 ; ..
BNE ReturnQNaN ; ..
ORRS r3, r2, r4 ; mantissa2 != 0 &&
BEQ ReturnQNaN ; ..
TST r2, #dSignalBit ; mantissa2[MSb] == 0)
BEQ SignalInvalid ; SignalInvalid
B ReturnQNaN ; else
; ReturnQNaN
CheckArg2INF
ADD r3, r1, #1 ; if (exponent2 == 2047 &&
CMP r3, #2048 ; ..
BNE ReturnINF ; ..
ORRS r3, r2, r4 ; mantissa2 == 0)
BNE ReturnINF ; ..
EORS r3, r10, r6 ; if (sign1 ^ sign2)
BMI SignalInvalid ; SignalInvalid
; else
; ReturnINF
ReturnINF
AND r1, r10, #0x80000000 ; Get sign bit
ORR r1, r1, r9, LSL #20 ; Insert exponent (exponent == 2047)
B return ; r0 is already 0 so just return
SignalInvalid
ORR r5, r5, #IVO_bit ; Set invalid operation
; Fall through: the result of an invalid operation is a QNaN.
ReturnQNaN
AND r1, r10, #0x80000000 ; Get sign bit
ORR r1, r1, r9, LSL #20 ; Insert exponent (exponent == 2047)
ORR r1, r1, #dSignalBit ; Insert mantissa high bit to ensure QNaN
ORR r1, r1, r8 ; OR in rest of high mantissa bits
B return ; r0 already has the low mantissa bits so
; just return
; Exception 2: parameter1 is finite, parameter2 is not normal (0 or denormal).
; Mantissas are still unshifted (no G,R,S room, no hidden bit) on entry.
;
; if (exponent1 == 0) // parameter1 is not normal
;   if (mantissa1 == 0) // parameter1 is 0
;     return properly signed 0
;   else if (mantissa2 != 0) // denormal+denormal
;     go normalize both and add
;   else // denormal+0
;     return parameter1
; else if (mantissa2 != 0) // parameter2 is denormal
;   go normalize parameter2 and add
; else // parameter2 is 0
;   return parameter1
exception2
CMP r9, #0 ; if parameter1 is not normal
BNE p1_normal ; ..
ORRS r7, r8, r0 ; if parameter1 is 0
BNE p1_denormal ; ..
;*** Rounding mode: proper sign is a function of the rounding mode.
AND r10, r10, r6 ; return properly signed 0 (negative only if both signs set)
B return_value ; mantissa and exponent are already 0
p1_denormal
ORRS r7, r2, r4 ; else if parameter2 is denormal
BNE p1_normalize ; go normalize both and add
B return_p1 ; else parameter2 is 0
; return parameter1
p1_normal ; (parameter2 is denormal or 0)
ORRS r7, r2, r4 ; else if parameter2 is denormal
BNE p2_normalize ; go normalize parameter2 and add
return_p1 ; else parameter2 is 0
; return_value shifts the mantissa right 3, so pre-shift it left 3 here.
MOV r8, r8, LSL #3 ; return parameter1
ORR r8, r8, r0, LSR #29
; ..
MOV r0, r0, LSL #3 ; ..
B return_value ; ..
; Both parameter1 and parameter2 are denormal. Normalize both then go add.
;
; Each operand is first shifted left once without touching its exponent (a
; denormal has no hidden bit), then shifted further with the exponent
; decremented per shift until the leading 1 reaches the NORMAL bit. The
; exponent may become negative.
p1_normalize ; Stop when we shift into 1.0 bit
MOVS r0, r0, LSL #1 ; Account for the hidden mantissa bit
MOV r8, r8, LSL #1 ; that denormals don't have
ORRCS r8, r8, #1 ; (carry out of the low word enters the high word)
CMP r8, #NORMAL ; While mantissa1 < 1.0
BGE end_p1_norm ; ..
p1_norm_loop
MOVS r0, r0, LSL #1 ; Scale mantissa1 up by 1 place
MOV r8, r8, LSL #1 ; ..
ORRCS r8, r8, #1 ; ..
SUB r9, r9, #1 ; and exponent1 down by 1
CMP r8, #NORMAL ; ..
BLT p1_norm_loop ; ..
end_p1_norm
; parameter1 is (now) normalized, parameter2 is denormal. Normalize
; parameter2 then go add.
p2_normalize ; Stop when we shift into 1.0 bit
MOVS r4, r4, LSL #1 ; Account for the hidden mantissa bit
MOV r2, r2, LSL #1 ; that denormals don't have
ORRCS r2, r2, #1 ; (carry out of the low word enters the high word)
CMP r2, #NORMAL ; While mantissa2 < 1.0
BGE end_p2_norm ; ..
p2_norm_loop
MOVS r4, r4, LSL #1 ; Scale mantissa2 up by 1 place
MOV r2, r2, LSL #1 ; ..
ORRCS r2, r2, #1 ; ..
SUB r1, r1, #1 ; and exponent2 down by 1
CMP r2, #NORMAL ; ..
BLT p2_norm_loop ; ..
end_p2_norm
B exception_return2 ; Rejoin the main path with both operands normalized
; Done
END