singrdk/base/Kernel/Native/arm/Crt/veneer_f.asm

960 lines
38 KiB
NASM
Raw Permalink Normal View History

2008-11-17 18:29:00 -05:00
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Microsoft Research Singularity
;;;
;;; Copyright (c) Microsoft Corporation. All rights reserved.
;;;
;;; This file contains ARM-specific assembly code.
;;;
; veneer_f.s - float add/sub/mul/div
;
; Copyright (C) Advanced RISC Machines Limited, 1994. All rights reserved.
;
; RCS Revision: 1
; Checkin Date: 2007/06/29 02:59:16
; Revising Author
;
; Local storage size and offsets.
; Every prologue in this file pushes lr and then subtracts LOC_SIZE from sp;
; the offsets below are byte offsets from sp after that adjustment.
LOC_SIZE EQU 0x18
; Slots where the prologues save the caller's original operands (r1 and r0).
OrgOp2l EQU 0x14
OrgOp1l EQU 0x10
; Slots filled in by __fArithReturn before it calls FPE_Raise:
; the default result and a copy of the original second operand.
ExDResl EQU 0x08
ExOp2l EQU 0x00
; Slot whose address is handed to FPE_Raise and from which the final result is
; reloaded. It has the same offset as OrgOp1l, so the saved first operand is
; overwritten by the returned result.
NewResl EQU 0x10
GET fpe.asm
GET kxarm.inc
; Register aliases. Several names deliberately share one physical register,
; so only one of the aliased values can be live at a time:
;   r0  = a, den          r1  = b, num
;   r2  = expa, exp       r3  = expb, sign, shift, res, div
;   r12 = tmp, mask       r14 = guess (also carries the exception info word)
a RN 0
b RN 1
tmp RN 12
mask RN 12
expa RN 2
expb RN 3
exp RN expa
sign RN expb
shift RN expb
res RN expb
guess RN 14
num RN b
den RN a
div RN 3
;===============================================================================
;
; RDCFix:
; BUGBUG: These comments aren't necessarily right anymore.
;
;
; __adds/__subs:
;
; Upon entry the signs are checked, and if they are not equal, control is given
; to the inverse routine while negating the second operand. Ie. __adds(+A,-B) ->
; __subs(+A,+B) and __subs(-A,+B) -> __adds(-A,-B). After this check the signs are
; known to be equal.
;
; The operands are sorted to ensure that A >= B. This enables many checks to
; be simplified: (A == 0) || (B == 0) reduces to (B == 0). The calculations
; are also simpler: only operand B needs to be shifted. Unsigned arithmetic
; is used to compare the packed numbers, since we want to have the operand with
; the largest magnitude as operand A.
;
; Special cases, namely zeroes, infinities, denormals and Not-a-Numbers (NaNs)
; are checked for on entry. If one of the operands is special, a jump is made
; to handle these cases out of line to keep overhead for the general case as
; low as possible. Because the operands are sorted, only 2 checks need to be
; made: operand A is checked for NaN, while operand B is checked for zero.
;
; As the signs of the operands are known to be equal and the operands
; are ordered, the sign of the result is the sign of one of the operands.
; Since the exponent can only change a little (one in __adds, and often little
; in __subs), the sign and exponent are not separated.
;
; In __adds, the operands are added with the smallest one shifted right with the
; exponent difference. The fraction might be larger than 1.0, and is renormalised
; if necessary (max 1 right shift needed). The exponent is adjusted with -1
; (+ 1 if the fraction was >= 2.0) to counter for the leading bit when the
; fraction and exponent are combined (using an ADD instruction).
;
; In __subs, operand B is subtracted from A, after being shifted right with the
; exponent difference. The result cannot be negative since A >= B, but it can
; result in an unnormalized number (as the high bits of A and B might cancel out).
; The common case results in the exponent being adjusted with +0 or -1, this is
; when the MSB is still set, or when the next bit is set. In the last case
; underflow to a denormalized number is possible. Rounding proceeds as normal.
; When 2 or more leading bits of the result are clear, the result must be
; normalized. If the resulting exponent is smaller than zero, denormalization
; follows. No rounding is necessary (the round bit is zero since we shifted
; left by at least 2 bits).
;
; In the rounding stage, the exponent is recombined with the fraction
; which leading bit is still set (if it is normalized). This causes the
; exponent to increment by one. Therefore, the exponent has been decremented
; in an earlier stage.
; The round-bit is calculated in the result by using more precision than
; necessary. After the result is shifted right to remove these, the carry
; contains the roundbit.
; The guard bits are in the second operand, which are calculated by
; left shifting. This is only necessary if the roundbit was set.
; Round to even is implemented by always rounding upwards, and
; clearing the LSB in case the guard bits were all zero. Thus an odd value will
; be rounded up, while an even value will not. While rounding, the fraction may
; become too large (>= 2.0), at which time the exponent must be incremented and
; the fraction shifted right. However, this doesn't need extra code, since
; exponent and fraction are already combined: the overflow from the fraction
; increments the exponent. Note that this means a denormalized number might
; become normalized while rounding!
;
; For __adds, overflow is being checked after rounding by adding 1 to the exponent.
; If the result was overflowed, the sign bit inverts (overflowed exponent is 255,
; and 255+1 negates the sign bit). Note that overflow can only occur after
; renormalization, or during rounding, but not in both.
; Overflow cannot occur in __subs.
;
; If one of the operands is an uncommon number, the following happens:
; If the largest operand is a NaN, an Invalid Operation is raised (which
; returns the NaN if it is a quiet NaN).
; For __adds, infinities are returned unaltered (inf + inf = inf), but in __subs an
; Invalid Operation exception is raised for inf - inf.
; If the smallest operand is a zero, the other operand is returned (thus A + 0
; -> A, A - 0 -> A, but a special case is -0 - -0 -> +0).
; Denormalized numbers are handled by decoding an unnormalized fraction with
; exponent 1. This is to make up for the hidden bit which is clear in
; denormalized numbers. Normal addition or subtraction can now proceed without any
; modification (the algorithms don't rely on the operands being normalized).
; The result can be a denormalized number or a normalized number.
;
; Frsb (B - A) is implemented by negating both signs on input of __subs. Its use
; is mainly intended for code size optimization.
;
;===========================================================================
[ :DEF: add_s
AREA |.text|, CODE, READONLY
Export __adds
Export __subs
Export __fArithReturn ;; RDCFix: Should move to common area
Export __fArithNaNCheck ;; RDCFix: Should move to common area
Export __flt_underflow ;; RDCFix: Should move to common area
IMPORT FPE_Raise
[ :DEF: thumb
CODE32
]
; __subs - single precision subtract entry point.
; In:  r0 = first operand (a), r1 = second operand (b), lr = return address.
; Saves lr, reserves LOC_SIZE bytes of locals, stores the untouched operands
; for the exception path, loads the operation code into r14 and continues at
; fsubtract. The result is delivered through __fArithReturn.
; Prologues for __adds, __subs, __muls, and __divs must be the same.
NESTED_ENTRY __subs
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate stack space
PROLOG_END
STR r1, [sp, #OrgOp2l] ; Save original args in case of exception
MOV r14, #_FpSubS ; Initialize no exceptions, float sub
STR r0, [sp, #OrgOp1l] ; Save original args in case of exception
B fsubtract
ENTRY_END __subs
; __adds - single precision add entry point.
; In:  r0 = first operand (a), r1 = second operand (b), lr = return address.
; Saves lr, reserves LOC_SIZE bytes of locals, stores the untouched operands
; for the exception path, loads the operation code into r14 and continues at
; faddition. The result is delivered through __fArithReturn.
; Prologues for __adds, __subs, __muls, and __divs must be the same.
NESTED_ENTRY __adds
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate stack space
PROLOG_END
STR r1, [sp, #OrgOp2l] ; Save original args in case of exception
MOV r14, #_FpAddS ; Initialize no exceptions, float add
STR r0, [sp, #OrgOp1l] ; Save original args in case of exception
B faddition
; Addition front end: sign dispatch, operand ordering and unpacking.
; On exit to fadd_add: a and b hold the fractions left-aligned (bit 31 is the
; leading one), exp holds sign<<8 + exponent of the larger operand, and shift
; holds the exponent difference.
_faddn ; Branch to here from subtract
EOR b, b, #1 << 31 ; flip the sign of b so both signs match
B _fadd1
faddition
; if the signs are unequal, it is a subtract
TEQ a, b ; N is set when the sign bits differ
BMI _fsubn
_fadd1
; swap a and b so that a >= b
; (unsigned compare; when a < b, tmp = a - b, so a - tmp = b and b + tmp = a)
SUBS tmp, a, b
SUBLO a, a, tmp
ADDLO b, b, tmp
; decode exponents, and filter out special cases
MOV exp, a, LSR #23 ; exp = sign<<8 + exponent
SUB shift, exp, b, LSR #23 ; shift = 0..254 (sign bits cancel out)
MOV tmp, #255 << 24
TST tmp, b, LSL #1 ; check for denorm/zero
TEQNE tmp, exp, LSL #24 ; check for inf/NaN
BEQ fadd_uncommon ; handle zeroes/denorms/infs/NaNs
; decode fractions and add the leading one
MOV tmp, #1 << 31
ORR a, tmp, a, LSL #8 ; a = 1.frac_a
ORR b, tmp, b, LSL #8 ; b = 1.frac_b
; Core of the addition. Expects a, b, exp and shift as produced by the
; front end (or by fadd_uncommon for denormals).
; First records inexact if any bit of b will be shifted out of the 24-bit
; result, then adds, rounds to nearest-even and checks for overflow.
fadd_add
; Check for inexact where all bits lost
CMP shift, #24 ; Shift amount >= 24?
ORRGE r14, r14, #INX_bit ; Set inexact (note b != +/-0)
BGE fadd_add_core
RSB tmp, shift, #24 ; Number of bits lost
MOVS tmp, b, LSL tmp ; Check lower bits of lesser operand
ORRNE r14, r14, #INX_bit ; If bits set, then inexact
fadd_add_core ; do the addition and renormalise
ADDS a, a, b, LSR shift ; CS if a >= 2.0
BCS fadd_carry
ADD exp, exp, #-1 ; adjust exp for leading bit
MOVS a, a, LSR #8 ; CS -> round up (never EQ)
ADC a, a, exp, LSL #23 ; combine sign, exp & fraction and round
BCC __fArithReturn ; ADC did not carry out of bit 31: done
RSB shift, shift, #25
MOVS b, b, LSL shift ; calc guard bits: CS,EQ -> round to even
MOV tmp, a, LSL #1 ; drop the sign so the exponent is in the top byte
CMNNE tmp, #1 << 24 ; check for overflow (if not round to even)
BCC __fArithReturn ; return if NOT(overflow OR round to even)
BICEQ a, a, #1 ; round to even
CMN tmp, #1 << 24 ; check for overflow
BCC __fArithReturn
; Addition overflowed: flag overflow + inexact and replace the result with an
; infinity carrying the sign currently in r0.
fadd_overflow ; sign in a is correct
ORR r14, r14, #OVF_bit :OR: INX_bit
; Set overflow and inexact
MOVS r0, r0 ; Check sign of result
; The two MOVs below do not set flags, so N from the MOVS above is still
; valid when ORRMI executes.
MOV r1, #0xFF ; Load up a correctly signed INF
MOV r0, r1, LSL #23 ; Move unsigned INF into result
ORRMI r0, r0, #0x80000000 ; Set sign bit if result negative
B __fArithReturn
; The fraction sum carried out of bit 31 (sum >= 2.0). Shift the carry back in,
; account for the extra lost bit in the inexact flag, then round and check for
; overflow. exp is not decremented on this path, so adding the leading bit
; during recombination bumps the exponent by one.
fadd_carry
MOV a, a, RRX ; restore leading bit
MOVS tmp, a, ROR #8 ; Check for inexact
ORRMI r14, r14, #INX_bit ; Set inexact if bit set
MOVS a, a, LSR #8 ; CS -> round up (never EQ)
ADC a, a, exp, LSL #23 ; combine sign, exp & fraction and round
MOV tmp, a, LSL #1 ; drop the sign so the exponent is in the top byte
CMNCC tmp, #1 << 24 ; check for overflow (if not round to even)
BCC __fArithReturn
CMN tmp, #1 << 24
BCS fadd_overflow
; Rounded up without overflow: undo the increment of the LSB when the guard
; bits below the round bit are all zero (tie -> round to even).
RSB shift, shift, #24
MOVS b, b, LSL shift ; doesn't set carry if shift = 24!
BICEQ a, a, #1
B __fArithReturn
; Addition special cases. Reached when b has a zero exponent field or a has an
; all-ones exponent field. tmp still holds 255 << 24 from the front end.
fadd_uncommon
; handle denorms, infinities and NaNs
TEQ tmp, exp, LSL #24 ; filter out NaN and infinites (EQ)
BEQ fadd_inf_NaN
; fast check for zeroes
MOVS tmp, b, LSL #1 ; EQ if b is zero
BEQ __fArithReturn ; return A + 0 = A
; b is denormalized, a might be
MOV a, a, LSL #8 ; a = 0.frac_a
MOV b, b, LSL #8 ; b = 0.frac_b
TST exp, #255 ; a denormalized? (exp == 0 -> EQ)
ORRNE a, a, #1 << 31 ; no denorm, add leading one
SUBNE shift, shift, #1 ; correct shift
ADDEQ exp, exp, #1 ; both denorms - correct exp
B fadd_add
fadd_inf_NaN
; handle infinities and NaNs - a is infinite or NaN, b might be
MOVS tmp, a, LSL #9 ; EQ if a inf, NE if a NaN
BEQ __fArithReturn ; a is infinite: return it unchanged
B __fArithNaNCheck
; Subtraction front end: sign dispatch, operand ordering and unpacking.
; On exit to fsub_sub_core: a holds +1.frac_a with the leading one in bit 24
; and the fraction in bits 23..1, b holds the negated second fraction in the
; same format (or 0xFF-prefixed and un-negated when shift >= 32, see below),
; exp holds sign<<8 + exponent of the larger operand, shift the exponent
; difference.
_fsubn ; Branch here from add
EOR b, b, #1 << 31 ; flip the sign of b so both signs match
B _fsub1
fsubtract
; if the signs are unequal, it is an addition
TEQ a, b ; N is set when the sign bits differ
BMI _faddn
_fsub1
; swap a and b so that a >= b
SUBS tmp, a, b
EORLO tmp, tmp, #1 << 31 ; negate both opnds (A - B = -B - -A)
SUBLO a, a, tmp
ADDLO b, b, tmp
; decode exponents, and filter out special cases
MOV exp, a, LSR #23 ; exp = sign<<8 + exponent
SUB shift, exp, b, LSR #23 ; shift = 0..254 (sign bits cancel out)
MOV tmp, #255 << 24
TST tmp, b, LSL #1 ; check for denorm/zero
TEQNE tmp, exp, LSL #24 ; check for inf/NaN
BEQ fsub_uncommon ; handle zeroes/denorms/infs/NaNs
; decode fractions and add the leading one
; (tmp = 0xFF000000: force the top byte to ones, then clear all but bit 24)
ORR a, tmp, a, LSL #1
BIC a, a, #0xFE000000
ORR b, tmp, b, LSL #1
; Check for inexact
CMP shift, #32 ; Shift amount >= 32?
ORRGE r14, r14, #INX_bit ; Set inexact (note b != +/-0)
; When taken, b is left as 0xFFxxxxxx and is not negated; the ASR in
; fsub_sub_core then shifts in copies of its set top bit.
BGE fsub_sub_core
RSB tmp, shift, #32 ; Number of bits lost
MOVS tmp, b, LSL tmp ; Check lower bits of lesser operand
ORRNE r14, r14, #INX_bit ; If bits set, then inexact
fsub_dosub
; 0xFE000000 - (0xFF000000 + frac) = -(0x01000000 + frac)
RSB b, b, #0xFE000000 ; Negate B
; Subtract by adding the (already negated) b, arithmetically shifted right by
; the exponent difference, then classify how far the leading one moved.
fsub_sub_core
; do the subtraction and calc number of bits to renormalise (0, 1, >=2)
ADD a, a, b, ASR shift
MOVS tmp, a, LSL #8 ; CS = 10/11, CC,MI = 01, CC,PL = 00
BCS fsub_renorm_0 ; high bit still set - no renormalisation
BPL fsub_renormalise ; high 2 bits clear - renormalise >= 2 bits
TST expa, #254 ; exp == 1? (cannot be zero)
BEQ fsub_renormalise ; yes -> underflow to denormalized number
; otherwise fall through into fsub_renorm_1
; Leading one moved down by exactly one position. tmp holds a << 8 from
; fsub_sub_core. Rounding information is recomputed from b and shift.
fsub_renorm_1
; 1 left shift needed, exp -= 1
MOV a, tmp, ASR #8 ; doesn't set carry - no early exit!
; TST tmp, #0xFF ; RDCFix: Need this?
; ORRNE r14, r14, #INX_bit ; RDCFix: Need this?
RSBS shift, shift, #32+1 ; shift can be <= 0...
MOVLS shift, #1 ; shift 1 -> CS and NE - always roundup
MOVS b, b, LSL shift ; calc rounding (CS) and guard bits (EQ)
ADC a, a, exp, LSL #23 ; recombine sign, exponent and fraction
BCC __fArithReturn ; no round-up happened
BNE __fArithReturn ; rounded up and guard bits non-zero: keep it
; ORR r14, r14, #INX_bit ; RDCFix: Need this?
BICEQ a, a, #1 ; round to even
B __fArithReturn
; Leading one is still in place. tmp holds a << 8 from fsub_sub_core; the
; result is tmp >> 9, so the lowest bit of the difference is the round bit.
fsub_renorm_0
; no renormalisation needed
; RDCFix: Is this right?
MOVS a, tmp, LSL #32-9 ; Check if we're throwing away any bits
ORRNE r14, r14, #INX_bit ; If we are, set inexact
MOVS a, tmp, LSR #9 ; CS -> round up
ADC a, a, exp, LSL #23 ; recombine sign, exponent and fraction
BCC __fArithReturn ; no round-up happened
RSBS shift, shift, #32-0 ; shift can be <= 0... -> don't round to even
MOVHSS b, b, LSL shift ; EQ -> round to even
BNE __fArithReturn
BICEQ a, a, #1 ; round to even
B __fArithReturn
; Two or more leading bits cancelled. Normalise a by shifting left in steps of
; 8/8/4/2/1 while decrementing exp, then either recombine directly or fall
; through into __flt_underflow to build a denormal.
; Note: sign aliases r3 (shift/res), which is overwritten here.
fsub_renormalise
; >= 2 bits renormalisation needed
MOV sign, exp, LSR #8 ; keep the sign (bit 8 of exp) in bit 0 of sign
TST a, #0x00FF0000 ; bit 16..23 set?
BNE fsub_renorm_small
fsub_renorm_large
; bit 16..23 clear, >= 8 bits renormalisation
MOVS a, a, LSL #8
BEQ __fArithReturn ; return +0 if result is zero
SUB exp, exp, #8
TST a, #0x00FF0000 ; bit 16..23 set?
MOVEQ a, a, LSL #8
SUBEQ expa, expa, #8 ; expa and exp are the same register
fsub_renorm_small
; renormalise A until bit 23 is set
TST a, #0x00F00000
MOVEQ a, a, LSL #4
SUBEQ exp, exp, #4
TST a, #0x00C00000
MOVEQ a, a, LSL #2
SUBEQ exp, exp, #2
CMP a, #1 << 23
MOVCC a, a, LSL #1
ADC exp, exp, #-3 ; carry from the CMP selects the -2 / -3 adjustment
TEQ sign, exp, LSR #8 ; exponent underflow? (signs differ if so)
ADDEQ a, a, exp, LSL #23 ; no rounding necessary
BEQ __fArithReturn
; underflow to denormalized number
RSB exp, exp, #0 ; r2 = 0 - exp, the shift count expected below
; falls through into __flt_underflow
;;
;; RDCFix: Move this to a common area (out of the adds/subs routine).
;;
;; Code adapted from except.s __flt_underflow
;;
;; Builds a denormalised (or zero) result by shifting the fraction in r0 right
;; by r2, rounding to nearest-even, and recording inexact in r14. Exits through
;; __fArithReturn. Entered by fall-through from fsub_renormalise and by branch
;; from the multiply and divide underflow paths.
;;
;; Note that an underflow cannot occur for an add nor a subtract. This is
;; because the Pegasus FP Specification states that underflow happens if the
;; result is denormal (or zero) after rounding and inexact. Since the only
;; way we can get a denormal result from an add or subtract is to add/subtract
;; two denormals, and adding/subtracting two denormals is always exact (no
;; shift occurs as the exponents are equal), it is impossible to generate
;; an underflow condition. Thus, for add and subtract, this code will just
;; generate the correct result. The result will always be exact.
;;
;; For multiply and divide, inexacts must be detected here. An inexact here
;; may or may not also raise underflow in __fArithReturn. It is possible
;; for a normal result to enter here.
;;
;; Register usage:
;; r0 - underflowed number with leading bit set, round and guard bits
;; r2 - shift count for a (0 - exp)
;; r3 - sign in bit 0 (negative if set)
;; r14 - exception information (INX_bit may be ORed in)
;;
__flt_underflow
; RDCFix: What part of r2 is valid? Only low byte?
; I don't completely understand this.
; NOTE(review): ARM register-specified shifts use only the low byte of the
; shift register, which is presumably why only bits 5..7 are tested - confirm.
; Check for inexact
TST r2, #0x000000E0 ; Check for shift >= 32
ORRNE r14, r14, #INX_bit ; If shift >= 32, lost all bits: inexact
fp_underflow_calc_result
MOV r3, r3, LSL #31 ; Position sign into sign bit position
ORRS r3, r3, r0, LSR r2 ; Combine sign, exponent, and mantissa
BCS fp_underflow_carry ; last bit shifted out was set -> must round
RSB r2, r2, #32 ; Check for inexact to see if we shifted
MOVS r0, r0, LSL r2 ; any set bits out to the right
ORRNE r14, r14, #INX_bit ; If we did, set inexact
MOV r0, r3
B __fArithReturn
fp_underflow_carry
; The round bit (shifter carry-out) was set, so a non-zero bit was discarded.
ORR r14, r14, #INX_bit ; RDCFix: Why is inexact guaranteed here?
RSB r2, r2, #33
MOVS r2, r0, LSL r2 ; C = round bit again, EQ if all bits below it are zero
ADC r0, r3, #0 ; round up
BICEQ r0, r0, #1 ; exact tie: round to even
B __fArithReturn
; Subtraction special cases. Reached when b has a zero exponent field or a has
; an all-ones exponent field. tmp still holds 255 << 24 from the front end.
fsub_uncommon
TEQ tmp, exp, LSL #24 ; EQ if a is NaN or infinite
BEQ fsub_inf_NaN
fsub_denorm ; here b is denormalized or zero, a might be a normal number
; check whether a or b is zero - fast case
MOVS tmp, a, LSL #1 ; tmp = a without sign; EQ if a is zero
MOVEQ a, #0 ; -0 - -0 = +0
MOVS b, b, LSL #1 ; EQ if b == 0 or a == 0
BEQ __fArithReturn ; return a - 0 = a
; b is denormalized, a might be
TST exp, #255 ; a denormalized? (exp == 0 -> EQ)
BIC a, tmp, #0xFF000000 ; a = frac_a << 1, exponent bits removed
ORRNE a, a, #1 << 24 ; a is normal: add the leading one
SUBNE shift, shift, #1 ; correct shift (b has effective exponent 1)
ADDEQ exp, exp, #1 ; both denorms - correct exp
; Check for inexact
CMP shift, #31 ; Shift amount >= 31?
ORRGE r14, r14, #INX_bit ; Set inexact (note b != +/-0)
BGE fsub_sub_core ; b is left un-negated on this path
RSB tmp, shift, #31 ; Number of bits lost
MOVS tmp, b, LSL tmp ; Check lower bits of lesser operand
ORRNE r14, r14, #INX_bit ; If bits set, then inexact
RSB b, b, #0 ; negate b so fsub_sub_core can add it
B fsub_sub_core
; a is an infinity or a NaN (signs of a and b are equal at this point).
; NaN -> __fArithNaNCheck; inf - inf -> invalid operation, result becomes a
; quiet NaN; inf - finite -> a is returned unchanged.
fsub_inf_NaN
; handle infinities and NaNs - a is infinite or a NaN, b might be
MOVS tmp, a, LSL #9 ; a NaN? (NE)
BNE __fArithNaNCheck
CMP a, b ; a is infinite, b too? (EQ)
ORREQ r14, r14, #IVO_bit ; Set invalid operation if is
ORREQ r0, r0, #fSignalBit ; Make INF into a QNaN
B __fArithReturn ; return (IVO flagged above when both were infinite)
;;
;; __fArithReturn
;;
;; Register Usage:
;; r0 - Default return value
;; r14 - Exception information
;;
;; Stack on entry (as laid out by the shared prologue):
;; | Caller's Frame |
;; | |
;; +----------------+
;; | Return Address | sp + LOC_SIZE
;; +----------------+
;; | Original Arg2 | sp + OrgOp2l
;; +----------------+
;; | Original Arg1 | sp + OrgOp1l (reused as NewResl)
;; +----------------+
;; | default result | sp + ExDResl (written here)
;; +----------------+
;; | copy of Arg2 | sp + ExOp2l (written here) <-- SP
;; +----------------+
;; Stack Top
;;
;;
;; Standard return path for single precision arithmetic routines. It checks
;; if any exceptions occurred. If any exceptional conditions occurred, then
;; an FPIEEE exception record is allocated and filled with the appropriate
;; values and the exception handler called. Upon returning, the possibly
;; changed result is loaded from the returned union, the stack space is
;; restored, and control is returned to the caller. If no exceptions occurred,
;; then the default result is returned.
;;
__fArithReturn
TST r14, #FPECause_mask ; Any exceptions?
ADDEQ sp, sp, #LOC_SIZE ; None so pop original args
IF Interworking :LOR: Thumbing
LDMEQFD sp!, {lr} ; and return
BXEQ lr
ELSE
LDMEQFD sp!, {pc} ; and return
ENDIF
; Else we have an exception
; Check for underflow (denormal & inexact)
MOV tmp, #0xFF000000 ; Load up exponent mask << 1
TST r0, tmp, LSR #1 ; See if exponent is zero
BNE no_underflow ; Non-zero exponent so no underflow possible
TST r14, #INX_bit ; See if inexact bit is set
ORRNE r14, r14, #UNF_bit ; If inexact, then underflow
no_underflow
; Marshal the FPE_Raise arguments: r0 = address of the result slot,
; r1 = exception info, r2 = original Arg1; the default result and original
; Arg2 are passed in the stack slots at ExDResl and ExOp2l.
STR r0, [sp, #ExDResl] ; Push default result
LDR r0, [sp, #OrgOp2l] ; Get orig Arg2 off stack
STR r0, [sp, #ExOp2l] ; Push Arg2
LDR r2, [sp, #OrgOp1l] ; Get orig Arg1 off stack
MOV r1, r14 ; ExInfo
ADD r0, sp, #NewResl ; Pointer to result from ex. handler
; Note that this clobbers original
CALL FPE_Raise
IF Thumbing :LAND: :LNOT: Interworking
CODE16
bx pc ; switch back to ARM mode
nop
CODE32
ENDIF ; Arg1 and Arg2 on the stack
LDR r0, [sp, #NewResl] ; Get returned result
ADD sp, sp, #LOC_SIZE ; Pop orig. args and arg passing space
IF Interworking :LOR: Thumbing
LDMFD sp!, {lr} ; Return
BX lr
ELSE
LDMFD sp!, {pc} ; Return
ENDIF
;; __fArithNaNCheck
;;
;; Checks both operands for SNaNs and raises an exception if one is present.
;; If no SNaNs are present, then a QNaN is returned. At least one of Arg1
;; and Arg2 must be a NaN. Always exits through __fArithReturn.
;;
;; Register usage:
;; r0 - Arg1 (must be a NaN if Arg2 is not)
;; r1 - Arg2 (must be a NaN if Arg1 is not)
;; r14 - FP exception information
;;
;; fOP1, fOP2, a4 and fSignalBit are not defined in this file; presumably
;; they come from fpe.asm / kxarm.inc and name r0, r1, r3 and the quiet-NaN
;; bit respectively - confirm against those includes.
;;
;; Code adapted from except.s.
__fArithNaNCheck
MOV a4, #0x01000000
; (op << 1) + 0x01000000 carries with a non-zero sum only when the exponent
; field is all ones and the fraction is non-zero, i.e. HI <=> op is a NaN.
CMN a4, fOP1, LSL #1
BLS fcheck_opnd2_NaN ; Arg1 is not a NaN, so Arg2 must be
fcheck_opnd1_NaN
TST fOP1, #fSignalBit
ORREQ fOP1, fOP1, #fSignalBit ; signalling NaN: quieten it ...
ORREQ r14, r14, #IVO_bit ; ... and flag invalid operation
BEQ __fArithReturn
CMN a4, fOP2, LSL #1 ; Arg1 is a quiet NaN; is Arg2 a NaN as well?
BLS __fArithReturn ; no: return Arg1
fcheck_opnd2_NaN
MOV fOP1, fOP2 ; Arg2 becomes the result
TST fOP1, #fSignalBit
ORREQ fOP1, fOP1, #fSignalBit ; signalling NaN: quieten it ...
ORREQ r14, r14, #IVO_bit ; ... and flag invalid operation
B __fArithReturn
ENTRY_END __adds
]
;------------------------------------------------------------------------------
[ :DEF: mul_s
AREA |.text|, CODE, READONLY
Export __muls
Export fmul_fdiv_overflow
IMPORT __fArithReturn
IMPORT __fArithNaNCheck
IMPORT __flt_normalise2
IMPORT __flt_underflow
; MULL48 $a, $b, $res, $tmp
; 32x32 -> 64 bit unsigned multiply of two left-aligned fractions.
; $res receives the high word, $tmp the low word; if the low word is
; non-zero, bit 0 of $res is set as a sticky bit.
; Side effect: also subtracts 128 << 16 from the register named exp, which is
; not a macro parameter. Flags are left as set by the CMP on $tmp.
MACRO
MULL48 $a, $b, $res, $tmp
; a = AAAAAA00
; b = BBBBBB00
UMULL $tmp, $res, $a, $b
SUB exp, exp, #128 << 16 ; subtract bias+1 - 0..253 normal
CMP $tmp, #0
ORRNE $res, $res, #1
MEND
; __muls - single precision multiply entry point.
; In:  r0 = first operand (a), r1 = second operand (b), lr = return address.
; Saves lr and the original operands, loads the operation code into r14,
; extracts both exponents into bits 16..23 of expa/expb and diverts zero,
; denormal, infinite and NaN operands to fmul_uncommon. Otherwise leaves the
; fractions left-aligned with the leading one in bit 31 and the result sign in
; bit 8 of expa, and falls through into fmul_mul.
; Prologues for __adds, __subs, __muls, and __divs must be the same
NESTED_ENTRY __muls
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate local storage
PROLOG_END
STR r0, [sp, #OrgOp1l] ; Save off args in case of exception
MOV r14, #_FpMulS ; Initialize no exceptions, float multiply
STR r1, [sp, #OrgOp2l]
MOV mask, #255 << 16
ANDS expa, mask, a, LSR #7 ; expa = exponent_a << 16, EQ if zero/denorm
ANDNES expb, mask, b, LSR #7 ; expb = exponent_b << 16, EQ if zero/denorm
TEQNE expa, mask ; EQ if a is inf/NaN
TEQNE expb, mask ; EQ if b is inf/NaN
BEQ fmul_uncommon
TEQ a, b ; N set when the signs differ
ORRMI expa, expa, #1 << 8 ; record a negative result sign in bit 8
MOV mask, #1 << 31
ORR a, mask, a, LSL #8 ; a = 1.frac_a
ORR b, mask, b, LSL #8 ; b = 1.frac_b
; fmul_mul - multiply the two left-aligned fractions and normalise.
; In:  a, b = fractions with the leading one in bit 31, expa/expb = biased
;      exponents in bits 16..23 (result sign in bit 8 of expa),
;      r14 = exception information.
; Out: res = product fraction with the leading one in bit 31; bits 7..0 are
;      the round bit and sticky bits discarded by fmul_round,
;      exp = combined sign and result exponent, INX_bit ORed into r14 when the
;      result will not be exact. Falls through into fmul_round.
; Also entered from __flt_normalise2 through the address loaded with ADR.
fmul_mul
ADD exp, expa, expb
MULL48 a, b, res, tmp
;; r1 now available for scratch
CMP res, #&80000000 ; CS if product >= 2.0 (leading one already in bit 31)
MOVLO res, res, LSL #1 ; otherwise shift the leading one up into bit 31
ADC exp, exp, exp, ASR #16 ; recombine sign & exp, and adjust exp
; Inexact when any discarded bit is set. fmul_round keeps res[31:8], so all of
; res[7:0] (round bit included) plus the low product word must be tested.
; Testing only res[6:0] missed exact ties (discarded bits == 0x80), which are
; rounded but were never reported as inexact.
; The shifter carry-out produced here is not used; fmul_round's MOVS sets C.
ORRS r1, tmp, res, LSL #24 ; check low 8 bits, low word
ORRNE r14, r14, #INX_bit
; Round the normalised product in res to 24 bits (nearest-even), merge it with
; sign and exponent, and route possible overflow / underflow cases.
; exp values below 252 << 16 return directly; larger non-negative values go to
; fmul_fdiv_overflow; negative values fall through into fmul_underflow.
fmul_round
MOVS a, res, LSR #8 ; never EQ (leading bit)
ADC a, a, exp, LSL #23 ; add fraction, and round
TSTCS res, #0x7f ; EQ -> round to even
CMPNE exp, #252 << 16 ; possible overflow? (never EQ)
BLO __fArithReturn ; return if no overflow and no round to even
BICEQ a, a, #1 ; delayed round to even
CMP exp, #252 << 16
BLO __fArithReturn
BPL fmul_fdiv_overflow
; Exponent went negative. If rounding still produced a small normal exponent
; the packed result in a is returned; otherwise the unrounded fraction is
; handed to __flt_underflow for denormalisation.
fmul_underflow ; result may be normalised after rounding
MOV r1, a, LSL #1
SUB r1, r1, #1 << 24
CMP r1, #3 << 24 ; result exp in 1..3 -> return
BLO __fArithReturn
; res and sign share r3, so the fraction must be copied out before sign is set.
MOV a, res
MVN sign, exp, LSR #8 ; correct sign from underflowed exponent
RSB exp, exp, #8 ; calc denormalising shift
B __flt_underflow
;;
;; fmul_fdiv_overflow is shared between __muls and __divs.
;;
;; In:  a = packed, rounded result, exp = combined sign/exponent,
;;      r14 = exception information.
;; Returns a unchanged when its exponent field did not actually overflow;
;; otherwise flags overflow + inexact and returns a correctly signed infinity.
;;
fmul_fdiv_overflow ; result might not be overflowed after all
MOV tmp, a, LSL #1
ADD tmp, tmp, #1 << 24
CMP tmp, #254 << 24 ; Check for exp = 253 or 254
BHS __fArithReturn ; no overflow - 9 cycles overhead
SUBS a, a, exp, LSL #7 ; get correct sign
; The ORR / MOV / ORR below do not set flags, so N from the SUBS above is
; still valid when ORRMI executes.
ORR r14, r14, #OVF_bit :OR: INX_bit ; Set overflow and inexact
MOV a, #0x7F000000 ; Create a properly signed INF
ORR a, a, #0x00800000 ; ...
ORRMI a, a, #0x80000000 ; ...
B __fArithReturn
; Multiply special cases: at least one operand is zero, denormal, infinite or
; NaN. mask still holds 255 << 16 from the prologue.
fmul_uncommon ; a or b denorm/NaN/inf
; expb is recomputed because the conditional ANDNES in the prologue may not
; have executed.
AND expb, mask, b, LSR #7
TEQ a, b ; N set when the signs differ
ORRMI expa, expa, #1 << 8 ; record a negative result sign in bit 8
CMP expa, mask
CMPLO expb, mask
BHS fmul_inf_NaN ; either exponent field is all ones
; a or b denorm, first check for zero case
MOVS tmp, a, LSL #1
MOVNES tmp, b, LSL #1
MOVEQ a, expa, LSL #23 ; return signed zero
BEQ __fArithReturn
; normalise operands
ADR tmp, fmul_mul ; __flt_normalise2 returns to the address in tmp
B __flt_normalise2
; At least one operand is infinite or NaN. NaNs go to __fArithNaNCheck;
; inf * non-zero returns a signed infinity; inf * 0 flags invalid operation
; and returns a quiet NaN.
fmul_inf_NaN ; a or b is a NaN or infinite
MOV tmp, #0x01000000
CMN tmp, a, LSL #1 ; HI if a is a NaN
CMNLS tmp, b, LSL #1 ; else HI if b is a NaN
BHI __fArithNaNCheck
; now a or b is infinite - check that a and b are non-zero
MOVS tmp, a, LSL #1 ; a zero?
MOVNES tmp, b, LSL #1 ; b zero?
ORRNE expa, expa, #255 ; create infinite
MOV a, expa, LSL #23 ; with correct sign
; If NE: a & b nonzero, return infinite
ORREQ r14, r14, #IVO_bit ; If EQ: inf * 0 signals an exception
ORREQ a, a, #0x7F000000 ; Create a QNaN
ORREQ a, a, #0x00C00000 ; ...
B __fArithReturn
ENTRY_END __muls
]
;---------------------------------------------------------------------------
[ :DEF: div_s
AREA |.text|, CODE, READONLY
Export __divs
IMPORT __flt_normalise2
IMPORT __fArithReturn
IMPORT __flt_underflow
IMPORT fmul_fdiv_overflow
IMPORT __fArithNaNCheck
; Byte lookup table used by fdiv_div for the initial reciprocal estimate.
; It holds 4 zero bytes followed by 128 entries and is placed immediately
; BEFORE the fdiv_tab label: fdiv_div reads it with a negative register offset
; from pc (LDRB guess, [pc, -tmp, LSR #16]), so entries sit at negative
; offsets from the label. fdiv_div adds 256 to the loaded byte.
; NOTE(review): the 4 leading zero bytes look like padding/alignment - confirm.
; TODO: * halve lookup table size
DCB 0, 0, 0, 0
DCB 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17
DCB 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 35, 36
DCB 37, 39, 40, 41, 43, 44, 46, 47, 48, 50, 51, 53, 54, 56, 57, 59
DCB 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 80, 81, 83, 85
DCB 87, 88, 90, 92, 94, 96, 98,100,102,104,106,108,110,112,114,116
DCB 118,120,122,125,127,129,131,134,136,138,141,143,146,148,151,153
DCB 156,158,161,164,166,169,172,175,178,180,183,186,189,192,195,199
DCB 202,205,208,212,215,218,222,225,229,233,236,240,244,248,252,255
fdiv_tab
; __divs - single precision divide entry point.
; In:  r0 = dividend (a), r1 = divisor (b), lr = return address.
; Saves lr and the original operands, extracts both exponents into bits 16..23
; of expa/expb and diverts zero, denormal, infinite and NaN operands to
; fdiv_uncommon. Otherwise leaves the 24-bit fractions right-aligned (leading
; one in bit 23) in num (r1, from a) and den (r0, from b), the result sign in
; bit 8 of expa, and falls through into fdiv_div.
; Prologues for __adds, __subs, __muls, and __divs must be the same
NESTED_ENTRY __divs
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate local storage
PROLOG_END
STR r0, [sp, #OrgOp1l] ; Save off args in case of exception
MOV r14, #_FpDivS ; Initialize no exceptions, float divide
; Note that r14 is also used by "guess"
STR r1, [sp, #OrgOp2l]
MOV mask, #255 << 16
ANDS expa, mask, a, LSR #7 ; expa = exponent_a << 16, EQ if zero/denorm
ANDNES expb, mask, b, LSR #7 ; expb = exponent_b << 16, EQ if zero/denorm
CMPNE expa, #255 << 16 ; EQ if a is inf/NaN
CMPNE expb, #255 << 16 ; EQ if b is inf/NaN
BEQ fdiv_uncommon
TEQ a, b ; N set when the signs differ
ADDMI expa, expa, #1 << 8 ; record a negative result sign in bit 8
; den is r0 and num is r1, i.e. the operands swap registers here; a is staged
; in tmp so it is not lost when den overwrites r0.
ORR tmp, a, #1 << 23
ORR den, b, #1 << 23
BIC num, tmp, #0xFF000000
BIC den, den, #0xFF000000
; fdiv_div - fraction division core.
; In:  num (r1) and den (r0) = 24-bit fractions with the leading one in
;      bit 23, expa/expb = biased exponents in bits 16..23 (result sign in
;      bit 8 of expa).
; Uses a table-driven reciprocal estimate refined by one Newton-Raphson step,
; then two multiply-based long-division steps and a final correction.
; r14 is used as "guess" during the computation, so the exception information
; is re-initialised to _FpDivS afterwards; nothing may be flagged in r14
; before that point on this path.
; Instruction order is interleaved to fill multiplier result-delay slots and
; several instructions rely on flags set many lines earlier (the CMP num, den
; feeds both MOVLO and the later ADC exp) - do not reorder.
fdiv_div
; calculate exponent and find leading bit of result
SUB exp, expa, expb
CMP num, den
; this code fills result delay slots
;MOVLO num, num, LSL #1 ; shift so that div >= 1 << 23
;ADD exp, exp, #(127-2) << 16 ; subtract bias (one too small)
;ADC exp, exp, exp, ASR #16 ; calc exp, combine with sign
; lookup guess of 1/den - use rounded inverted tablelookup
; The constant folds the distance from this instruction's pc value to the
; table (which ends at fdiv_tab) into the index, so the LDRB can use a
; pc-relative negative register offset.
ADD tmp, den, #32768 + ((. + 12 - (fdiv_tab + 127)) << 16)
LDRB guess, [pc, -tmp, LSR #16]
RSB den, den, #0 ; result delay - negate den for MLA
ADD guess, guess, #256
; do one Newton-Raphson iteration to increase precision to 15 bits
MUL tmp, den, guess
MOVLO num, num, LSL #1 ; result delay - shift so that div >= 1 << 23
MOV tmp, tmp, ASR #4
MUL div, tmp, guess
MOV guess, guess, LSL #7
ADD guess, guess, div, ASR #21
; long division - 13 bits
MOV tmp, num, LSR #10
MUL tmp, guess, tmp
MOV num, num, LSL #12
MOV div, tmp, LSR #17
MLA num, den, div, num
ADD exp, exp, #(127-2) << 16 ; result delay - subtract bias (one too small)
; long division - 11 bits (can do 12)
MOV tmp, num, LSR #10
MUL tmp, guess, tmp
MOV num, num, LSL #11
MOV tmp, tmp, LSR #18
MLA num, den, tmp, num
ADC exp, exp, exp, ASR #16 ; result delay - calc exp, combine with sign
; correct div (may be one too small)
CMN num, den
ADDHS num, num, den ; now num < den
ADC div, tmp, div, LSL #11
MOV r14, #_FpDivS ; Reinitialize no exceptions, float divide
; Note that r14 was used for "guess"
CMP num, #0 ; Check for inexact
ORRNE r14, r14, #INX_bit ; Set inexact if bits lost
CMN den, num, LSL #1 ; CS -> round, EQ -> round to even
ADC a, div, exp, LSL #23 ; recombine exp and fraction - increment exp
CMPNE exp, #252 << 16 ; exp < 252 cannot overflow
BLO __fArithReturn
BICEQ a, a, #1
CMP exp, #252 << 16 ; exp < 252 cannot overflow
BLO __fArithReturn
BPL fmul_fdiv_overflow
; negative exp: fall through into fdiv_underflow
; Exponent went negative. If rounding still produced a small normal exponent
; the packed result in a is returned; otherwise the quotient plus a sticky
; guard bit is handed to __flt_underflow for denormalisation.
fdiv_underflow ; result may be normalised after rounding
MOV tmp, a, LSL #1
SUB tmp, tmp, #1 << 24
CMP tmp, #3 << 24 ; result exp in 1..3 -> return
BLO __fArithReturn
CMP num, #1 ; num contains implicit guard bits
; div and sign share r3, so a must be formed from div before sign is set.
ADC a, div, div ; add explicit guard bit (1 if num > 0)
MVN sign, exp, LSR #8 ; get correct sign
RSB exp, exp, #1 ; calc 1 - exp
B __flt_underflow
; Divide special cases: at least one operand is zero, denormal, infinite or
; NaN. mask still holds 255 << 16 from the prologue.
fdiv_uncommon
; expb is recomputed because the conditional ANDNES in the prologue may not
; have executed.
AND expb, mask, b, LSR #7
TEQ a, b ; N set when the signs differ
ORRMI expa, expa, #1 << 8 ; record a negative result sign in bit 8
CMP expa, mask
CMPLO expb, mask
BHS fdiv_inf_NaN ; either exponent field is all ones
; a or b denorm, first check for zero case
MOVS tmp, b, LSL #1
BEQ fdiv_divbyzero ; a / 0 -> division by zero
MOVS tmp, a, LSL #1 ; 0 / b -> 0
MOVEQ a, expa, LSL #23 ; return signed zero
BEQ __fArithReturn
; normalise operands
ADR tmp, fdiv_div1 ; __flt_normalise2 returns to the address in tmp
B __flt_normalise2
; __flt_normalise2 returns left-aligned fractions (leading one in bit 31);
; fdiv_div expects them right-aligned and with the operands swapped between
; r0/r1 (num = a in r1, den = b in r0), hence the staging through tmp.
fdiv_div1 ; remove... quick hack
MOV tmp, a, LSR #8
MOV den, b, LSR #8
MOV num, tmp
B fdiv_div
; At least one operand is infinite or NaN. NaNs go to __fArithNaNCheck;
; inf / inf flags invalid operation and returns a quiet NaN; x / inf returns a
; signed zero; inf / x returns a signed infinity.
fdiv_inf_NaN ; a or b is a NaN or infinite
MOV tmp, #0x01000000
CMN tmp, a, LSL #1 ; HI if a is a NaN
CMNLS tmp, b, LSL #1 ; else HI if b is a NaN
BHI __fArithNaNCheck
; now a or b is infinite - check that a and b are not both infinite
CMN tmp, a, LSL #1 ; EQ if a is infinite
CMNEQ tmp, b, LSL #1 ; EQ if b is infinite as well
MOVEQ a, expa, LSL #23 ; keep only the result sign (bit 8 of expa)
ORREQ r14, r14, #IVO_bit ; Set invalid
ORREQ a, a, #0x7F000000 ; Create QNaN
ORREQ a, a, #0x00C00000 ; ...
BEQ __fArithReturn ; inf / inf -> IVO
CMN tmp, b, LSL #1 ; b inf? (EQ)
MOVEQ a, #0 ; a / inf -> signed zero
BICNE a, a, #1 << 31 ; inf / b = inf (even inf / 0 = inf)
ORR a, a, expa, LSL #23 ; set sign
B __fArithReturn
; Divisor is zero (and neither operand is inf/NaN).
; 0 / 0 flags invalid operation and turns a (a signed zero) into a quiet NaN;
; x / 0 flags divide-by-zero and returns an infinity with the result sign.
fdiv_divbyzero ; b zero
MOVS tmp, a, LSL #1 ; EQ if a is zero too
ORREQ r14, r14, #IVO_bit ; 0 / 0 -> IVO
ORREQ a, a, #0x7F000000 ; Create QNaN
ORREQ a, a, #0x00C00000 ; ...
ORRNE r14, r14, #DVZ_bit ; A / 0 -> DVZ
MOVNE a, expa, LSL #23 ; set sign of result (returns signed inf)
ORRNE a, a, #0x7F000000 ; Create properly signed INF
ORRNE a, a, #0x00800000 ; ...
B __fArithReturn
ENTRY_END __divs
]
;---------------------------------------------------------------------------
[ :DEF: fnorm2_s
AREA |.text|, CODE, READONLY
EXPORT __flt_normalise2
; __flt_normalise2
; normalise a or b (or both). One operand is denormalised
; a = x0AAAAAA, bits 0-22 nonzero, bits 23-30 zero
; On return both fractions are left-aligned with the leading one in bit 31
; and the exponent (bits 16 and up of expa / expb) of each denormal operand
; has been lowered by the amount it was shifted.
; In:  a, b = packed operands, expa/expb = exponents << 16,
;      tmp = address to return to (this routine is entered with B, not BL).
; return to address in tmp
[ :DEF: thumb
CODE32
]
__flt_normalise2
MOV a, a, LSL #8
MOV b, b, LSL #8
TST expa, #255 << 16
BNE fnorm_b ; a is normal, so b is the denormal one
fnorm_a
; Shift a left in steps of 16/8/4/2/1 until bit 31 is set, adjusting expa.
CMP a, #1 << 16
SUBLO expa, expa, #16 << 16
MOVLO a, a, LSL #16
TST a, #255 << 24
SUBEQ expa, expa, #8 << 16
MOVEQ a, a, LSL #8
TST a, #15 << 28
SUBEQ expa, expa, #4 << 16
MOVEQ a, a, LSL #4
TST a, #3 << 30
SUBEQ expa, expa, #2 << 16
MOVEQS a, a, LSL #2 ; N comes from here, or from the TST if not executed
MOVPL a, a, LSL #1
ADDMI expa, expa, #1 << 16
TST expb, #255 << 16
ORRNE b, b, #1 << 31 ; b is normal: add its leading one ...
MOVNE pc, tmp ; ... and return
; otherwise b is denormal too: fall through
fnorm_b
ORR a, a, #1 << 31 ; a gets its leading one (already set if it came via fnorm_a)
; Shift b left in steps of 16/8/4/2/1 until bit 31 is set, adjusting expb.
CMP b, #1 << 16
SUBLO expb, expb, #16 << 16
MOVLO b, b, LSL #16
TST b, #255 << 24
SUBEQ expb, expb, #8 << 16
MOVEQ b, b, LSL #8
TST b, #15 << 28
SUBEQ expb, expb, #4 << 16
MOVEQ b, b, LSL #4
TST b, #3 << 30
SUBEQ expb, expb, #2 << 16
MOVEQS b, b, LSL #2 ; N comes from here, or from the TST if not executed
MOVPL b, b, LSL #1
ADDMI expb, expb, #1 << 16
MOV pc, tmp
]
;===========================================================================
END