;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;
|
|
;;; Microsoft Research Singularity
|
|
;;;
|
|
;;; Copyright (c) Microsoft Corporation. All rights reserved.
|
|
;;;
|
|
;;; This file contains ARM-specific assembly code.
|
|
;;;
|
|
|
|
; veneer_f.s - float add/sub/mul/div
|
|
;
|
|
; Copyright (C) Advanced RISC Machines Limited, 1994. All rights reserved.
|
|
;
|
|
; RCS Revision: 1
|
|
; Checkin Date: 2007/06/29 02:59:16
|
|
; Revising Author
|
|
;
|
|
|
|
; Local storage size and offsets into the LOC_SIZE-byte stack frame
; allocated by each entry prologue (below the saved lr).
; OrgOp1l/OrgOp2l hold the untouched input operands for the exception
; path; ExDResl/ExOp2l/NewResl are the slots used when FPE_Raise is
; called (see __fArithReturn).  Note NewResl aliases OrgOp1l (0x10):
; the original Arg1 is deliberately clobbered by the handler's result.
LOC_SIZE EQU 0x18
OrgOp2l EQU 0x14
OrgOp1l EQU 0x10
ExDResl EQU 0x08
ExOp2l EQU 0x00
NewResl EQU 0x10

GET fpe.asm
GET kxarm.inc

; Register aliases.  Several aliases share one physical register and
; are never live at the same time: exp is a view of expa (r2); sign,
; shift, res and div are views of expb (r3); num/den reuse b/a; guess
; shares r14 with the exception-info word (see __divs, which reloads
; r14 after the divide core).
a RN 0
b RN 1
tmp RN 12
mask RN 12
expa RN 2
expb RN 3
exp RN expa
sign RN expb
shift RN expb
res RN expb
guess RN 14
num RN b
den RN a
div RN 3
|
|
|
|
|
|
;===============================================================================
|
|
;
|
|
; RDCFix:
|
|
; BUGBUG: These comments aren't necessarily right anymore.
|
|
;
|
|
;
|
|
|
|
|
|
; __adds/__subs:
|
|
;
|
|
; Upon entry the signs are checked, and if they are not equal, control is given
|
|
; to the inverse routine while negating the second operand. Ie. __adds(+A,-B) ->
|
|
; __subs(+A,+B) and __subs(-A,+B) -> __adds(-A,-B). After this check the signs are
|
|
; known to be equal.
|
|
;
|
|
; The operands are sorted to ensure that A >= B. This enables many checks to
|
|
; be simplified: (A == 0) || (B == 0) reduces to (B == 0). The calculations
|
|
; are also simpler: only operand B needs to be shifted. Unsigned arithmetic
|
|
; is used to compare the packed numbers, since we want to have the operand with
|
|
; the largest magnitude as operand A.
|
|
;
|
|
; Special cases, namely zeroes, infinities, denormals and Not-a-Numbers (NaNs)
|
|
; are checked for on entry. If one of the operands is special, a jump is made
|
|
; to handle these cases out of line to keep overhead for the general case as
|
|
; low as possible. Because the operands are sorted, only 2 checks need to be
|
|
; made: operand A is checked for NaN, while operand B is checked for zero.
|
|
;
|
|
; As the signs of the operands are known to be equal and the operands
|
|
; are ordered, the sign of the result is the sign of one of the operands.
|
|
; Since the exponent can only change a little (one in __adds, and often little
|
|
; in __subs), the sign and exponent are not separated.
|
|
;
|
|
; In __adds, the operands are added with the smallest one shifted right with the
|
|
; exponent difference. The fraction might be larger then 1.0, and is renormalised
|
|
; if necessary (max 1 right shift needed). The exponent is adjusted with -1
|
|
; (+ 1 if the fraction was >= 2.0) to counter for the leading bit when the
|
|
; fraction and exponent are combined (using an ADD instruction).
|
|
;
|
|
; In __subs, operand B is subtracted from a, after being shifted right with the
|
|
; exponent difference. The result cannot be negative since A >= B, but it can
|
|
; result in a unnormalized number (as the high bits of A and B might cancel out).
|
|
; The common case results in the exponent being adjusted with +0 or -1, this is
|
|
; when the MSB is still set, or when the next bit is set. In the last case
|
|
; underflow to a denormalized number is possible. Rounding proceeds as normal.
|
|
; When 2 or more leading bits of the result are clear, the result must be
|
|
; normalized. If the resulting exponent is smaller than zero, denormalization
|
|
; follows. No rounding is necessary (the round bit is zero since we shifted
|
|
; left by at least 2 bits).
|
|
;
|
|
; In the rounding stage, the exponent is recombined with the fraction
|
|
; which leading bit is still set (if it is normalized). This causes the
|
|
; exponent to increment by one. Therefore, the exponent has been decremented
|
|
; in an earlier stage.
|
|
; The round-bit is calculated in the result by using more precision than
|
|
; necessary. After the result is shifted right to remove these, the carry
|
|
; contains the roundbit.
|
|
; The guard bits are in the second operand, which are calculated by
|
|
; left shifting. This is only necessary if the roundbit was set.
|
|
; Round to even is implemented by always rounding upwards, and
|
|
; clearing the LSB in case the guard bits were all zero. Thus an odd value will
|
|
; be rounded up, while an even value will not. While rounding, the fraction may
|
|
; become too large (>= 2.0), at which time the exponent must be incremented and
|
|
; the fraction shifted right. However, this doesn't need extra code, since
|
|
; exponent and fraction are already combined: the overflow from the fraction
|
|
; increments the exponent. Note that this means a denormalized number might
|
|
; become normalized while rounding!
|
|
;
|
|
; For __adds, overflow is being checked after rounding by adding 1 to the exponent.
|
|
; If the result was overflowed, the sign bit inverts (overflowed exponent is 255,
|
|
; and 255+1 negates the sign bit). Note that overflow can only occur after
|
|
; renormalization, or during rounding, but not in both.
|
|
; Overflow cannot occur in __subs.
|
|
;
|
|
; If one of the operands is an uncommon number, the following happens:
|
|
; If the largest operand is a NaN, an Invalid Operation is raised (which
|
|
; returns the NaN if it is a quiet NaN).
|
|
; For __adds, infinities are returned unaltered (inf + inf = inf), but in __subs a
|
|
; Invalid Operation exception is raised for inf - inf.
|
|
; If the smallest operand is a zero, the other operand is returned (thus A + 0
|
|
; -> A, A - 0 -> A, but a special case is -0 - -0 -> +0).
|
|
; Denormalized numbers are handled by decoding an unnormalized fraction with
|
|
; exponent 1. This is to make up for the hidden bit which is clear in
|
|
; denormalized numbers. Normal addition or subtraction can now proceed without any
|
|
; modification (the algorithms don't rely on the operands being normalized).
|
|
; The result can be a denormalized number or a normalized number.
|
|
;
|
|
; Frsb (B - A) is implemented by negating both signs on input of __subs. Its use
|
|
; is mainly intended for code size optimization.
|
|
;
|
|
;===========================================================================
|
|
|
|
; Assembled only when add_s is defined: single-precision add/subtract
; plus the shared return/NaN/underflow helpers exported from here.
[ :DEF: add_s

AREA |.text|, CODE, READONLY

Export __adds
Export __subs
Export __fArithReturn ;; RDCFix: Should move to common area
Export __fArithNaNCheck ;; RDCFix: Should move to common area
Export __flt_underflow ;; RDCFix: Should move to common area
IMPORT FPE_Raise
[ :DEF: thumb
CODE32 ; these routines are ARM-mode even in a Thumb build
]
|
|
|
|
; Prologues for __adds, __subs, __muls, and __divs must be the same.
; float __subs(float a, float b): IEEE single-precision subtract.
; In: r0 = a, r1 = b.  Out: r0 = a - b.  r14 carries the exception
; word (_FpSubS plus sticky exception bits) until __fArithReturn.
NESTED_ENTRY __subs
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate stack space
PROLOG_END
STR r1, [sp, #OrgOp2l] ; Save original args in case of exception
MOV r14, #_FpSubS ; Initialize no exceptions, float sub
STR r0, [sp, #OrgOp1l] ; Save original args in case of exception
B fsubtract

ENTRY_END __subs
|
|
|
|
|
|
|
|
; Prologues for __adds, __subs, __muls, and __divs must be the same.
; float __adds(float a, float b): IEEE single-precision add.
; In: r0 = a, r1 = b.  Out: r0 = a + b.  r14 carries the exception
; word (_FpAddS plus sticky exception bits) until __fArithReturn.
NESTED_ENTRY __adds
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate stack space
PROLOG_END
STR r1, [sp, #OrgOp2l] ; Save original args in case of exception
MOV r14, #_FpAddS ; Initialize no exceptions, float add
STR r0, [sp, #OrgOp1l] ; Save original args in case of exception
B faddition
|
|
|
|
|
|
|
|
|
|
|
|
; Same-sign magnitude addition core.  Entered at faddition from the
; __adds prologue, or at _faddn from the subtract path when the signs
; differ (b is negated so the signs match).  See the algorithm notes
; at the top of the file.
_faddn ; Branch to here from subtract
EOR b, b, #1 << 31
B _fadd1

faddition
; if the signs are unequal, it is a subtract
TEQ a, b
BMI _fsubn
_fadd1
; swap a and b so that a >= b (unsigned compare of packed floats)
SUBS tmp, a, b
SUBLO a, a, tmp
ADDLO b, b, tmp
; decode exponents, and filter out special cases
MOV exp, a, LSR #23 ; exp = sign<<8 + exponent
SUB shift, exp, b, LSR #23 ; shift = 0..254 (sign bits cancel out)
MOV tmp, #255 << 24
TST tmp, b, LSL #1 ; check for denorm/zero
TEQNE tmp, exp, LSL #24 ; check for inf/NaN
BEQ fadd_uncommon ; handle zeroes/denorms/infs/NaNs
; decode fractions and add the leading one (fractions left-justified)
MOV tmp, #1 << 31
ORR a, tmp, a, LSL #8 ; a = 1.frac_a
ORR b, tmp, b, LSL #8 ; b = 1.frac_b

fadd_add
; Check for inexact where all bits lost
CMP shift, #24 ; Shift amount >= 24?
ORRGE r14, r14, #INX_bit ; Set inexact (note b != +/-0)
BGE fadd_add_core
RSB tmp, shift, #24 ; Number of bits lost
MOVS tmp, b, LSL tmp ; Check lower bits of lesser operand
ORRNE r14, r14, #INX_bit ; If bits set, then inexact

fadd_add_core ; do the addition and renormalise
ADDS a, a, b, LSR shift ; CS if a >= 2.0
BCS fadd_carry
ADD exp, exp, #-1 ; adjust exp for leading bit
MOVS a, a, LSR #8 ; CS -> round up (never EQ)
ADC a, a, exp, LSL #23 ; combine sign, exp & fraction and round
BCC __fArithReturn
RSB shift, shift, #25
MOVS b, b, LSL shift ; calc guard bits: CS,EQ -> round to even
MOV tmp, a, LSL #1
CMNNE tmp, #1 << 24 ; check for overflow (if not round to even)
BCC __fArithReturn ; return if NOT(overflow OR round to even)
BICEQ a, a, #1 ; round to even
CMN tmp, #1 << 24 ; check for overflow
BCC __fArithReturn

fadd_overflow ; sign in a is correct
ORR r14, r14, #OVF_bit :OR: INX_bit
; Set overflow and inexact
MOVS r0, r0 ; Check sign of result
MOV r1, #0xFF ; Load up a correctly signed INF
MOV r0, r1, LSL #23 ; Move unsigned INF into result
ORRMI r0, r0, #0x80000000 ; Set sign bit if result negative
B __fArithReturn

fadd_carry ; sum was >= 2.0: shift right one, exp unchanged net
MOV a, a, RRX ; restore leading bit
MOVS tmp, a, ROR #8 ; Check for inexact
ORRMI r14, r14, #INX_bit ; Set inexact if bit set
MOVS a, a, LSR #8 ; CS -> round up (never EQ)
ADC a, a, exp, LSL #23 ; combine sign, exp & fraction and round
MOV tmp, a, LSL #1
CMNCC tmp, #1 << 24 ; check for overflow (if not round to even)
BCC __fArithReturn
CMN tmp, #1 << 24
BCS fadd_overflow
RSB shift, shift, #24
MOVS b, b, LSL shift ; doesn't set carry if shift = 24!
BICEQ a, a, #1
B __fArithReturn

fadd_uncommon
; handle denorms, infinities and NaNs
TEQ tmp, exp, LSL #24 ; filter out NaN and infinites (EQ)
BEQ fadd_inf_NaN
; fast check for zeroes
MOVS tmp, b, LSL #1 ; EQ if b is zero
BEQ __fArithReturn ; return A + 0 = A
; b is denormalized, a might be
MOV a, a, LSL #8 ; a = 0.frac_a
MOV b, b, LSL #8 ; b = 0.frac_b
TST exp, #255 ; a denormalized? (exp == 0 -> EQ)
ORRNE a, a, #1 << 31 ; no denorm, add leading one
SUBNE shift, shift, #1 ; correct shift
ADDEQ exp, exp, #1 ; both denorms - correct exp
B fadd_add

fadd_inf_NaN
; handle infinities and NaNs - a is infinite or NaN, b might be
; (inf + inf = inf is fine for add; only NaNs need the checker)
MOVS tmp, a, LSL #9 ; EQ if a inf, NE if a NaN
BEQ __fArithReturn
B __fArithNaNCheck
|
|
|
|
|
|
; Same-sign magnitude subtraction core.  Entered at fsubtract from the
; __subs prologue, or at _fsubn from the add path when the signs
; differ.  On the common path a >= b, so the result is non-negative.
_fsubn ; Branch here from add
EOR b, b, #1 << 31
B _fsub1

fsubtract
; if the signs are unequal, it is an addition
TEQ a, b
BMI _faddn
_fsub1
; swap a and b so that a >= b
SUBS tmp, a, b
EORLO tmp, tmp, #1 << 31 ; negate both opnds (A - B = -B - -A)
SUBLO a, a, tmp
ADDLO b, b, tmp
; decode exponents, and filter out special cases
MOV exp, a, LSR #23 ; exp = sign<<8 + exponent
SUB shift, exp, b, LSR #23 ; shift = 0..254 (sign bits cancel out)
MOV tmp, #255 << 24
TST tmp, b, LSL #1 ; check for denorm/zero
TEQNE tmp, exp, LSL #24 ; check for inf/NaN
BEQ fsub_uncommon ; handle zeroes/denorms/infs/NaNs
; decode fractions and add the leading one
; (here fractions sit in bits 1..24: one guard position below bit 25)
ORR a, tmp, a, LSL #1
BIC a, a, #0xFE000000
ORR b, tmp, b, LSL #1

; Check for inexact
CMP shift, #32 ; Shift amount >= 32?
ORRGE r14, r14, #INX_bit ; Set inexact (note b != +/-0)
BGE fsub_sub_core
RSB tmp, shift, #32 ; Number of bits lost
MOVS tmp, b, LSL tmp ; Check lower bits of lesser operand
ORRNE r14, r14, #INX_bit ; If bits set, then inexact

fsub_dosub
RSB b, b, #0xFE000000 ; Negate B

fsub_sub_core
; do the subtraction and calc number of bits to renormalise (0, 1, >=2)
ADD a, a, b, ASR shift
MOVS tmp, a, LSL #8 ; CS = 10/11, CC,MI = 01, CC,PL = 00
BCS fsub_renorm_0 ; high bit still set - no renormalisation
BPL fsub_renormalise ; high 2 bits clear - renormalise >= 2 bits
TST expa, #254 ; exp == 1? (cannot be zero)
BEQ fsub_renormalise ; yes -> underflow to denormalized number
fsub_renorm_1
; 1 left shift needed, exp -= 1
MOV a, tmp, ASR #8 ; doesn't set carry - no early exit!
; TST tmp, #0xFF ; RDCFix: Need this?
; ORRNE r14, r14, #INX_bit ; RDCFix: Need this?
RSBS shift, shift, #32+1 ; shift can be <= 0...
MOVLS shift, #1 ; shift 1 -> CS and NE - always roundup
MOVS b, b, LSL shift ; calc rounding (CS) and guard bits (EQ)
ADC a, a, exp, LSL #23 ; recombine sign, exponent and fraction
BCC __fArithReturn
BNE __fArithReturn
; ORR r14, r14, #INX_bit ; RDCFix: Need this?
BICEQ a, a, #1 ; round to even
B __fArithReturn
|
|
|
|
fsub_renorm_0
; no renormalisation needed (MSB of the difference still set)
; RDCFix: Is this right?
MOVS a, tmp, LSL #32-9 ; Check if we're throwing away any bits
ORRNE r14, r14, #INX_bit ; If we are, set inexact
MOVS a, tmp, LSR #9 ; CS -> round up
ADC a, a, exp, LSL #23 ; recombine sign, exponent and fraction
BCC __fArithReturn
RSBS shift, shift, #32-0 ; shift can be <= 0... -> don't round to even
MOVHSS b, b, LSL shift ; EQ -> round to even
BNE __fArithReturn
BICEQ a, a, #1 ; round to even
B __fArithReturn
|
|
|
|
fsub_renormalise
; >= 2 bits renormalisation needed: binary-search the leading bit back
; to position 23 (8/4/2/1 steps), adjusting exp as we go.  No rounding
; is needed on this path (at least 2 zero bits were shifted in).
MOV sign, exp, LSR #8
TST a, #0x00FF0000 ; bit 16..23 set?
BNE fsub_renorm_small
fsub_renorm_large
; bit 16..23 clear, >= 8 bits renormalisation
MOVS a, a, LSL #8
BEQ __fArithReturn ; return +0 if result is zero
SUB exp, exp, #8
TST a, #0x00FF0000 ; bit 16..23 set?
MOVEQ a, a, LSL #8
SUBEQ expa, expa, #8
fsub_renorm_small
; renormalise A until bit 23 is set
TST a, #0x00F00000
MOVEQ a, a, LSL #4
SUBEQ exp, exp, #4
TST a, #0x00C00000
MOVEQ a, a, LSL #2
SUBEQ exp, exp, #2
CMP a, #1 << 23
MOVCC a, a, LSL #1
ADC exp, exp, #-3
TEQ sign, exp, LSR #8 ; exponent underflow? (signs differ if so)
ADDEQ a, a, exp, LSL #23 ; no rounding necessary
BEQ __fArithReturn
; underflow to denormalized number - fall through (past the comment
; block below) into __flt_underflow with exp = denormalising shift
RSB exp, exp, #0
|
|
|
|
;;
|
|
;; RDCFix: Move this to a common area (out of the adds/subs routine).
|
|
;;
|
|
;; Code adapted from except.s __flt_underflow
|
|
;;
|
|
;; Note that an underflow cannot occur for an add nor a subtract. This is
|
|
;; because the Pegasus FP Specification states that underflow happens if the
|
|
;; result is denormal (or zero) after rounding and inexact. Since the only
|
|
;; way we can get a denormal result from an add or subtract is to add/subtract
|
|
;; two denormals, and adding/subtracting two denormals is always exact (no
|
|
;; shift occurs as the exponents are equal), it is impossible to generate
|
|
;; an underflow condition. Thus, for add and subtract, this code will just
|
|
;; generate the correct result. The result will always be exact.
|
|
;;
|
|
;; For multiply and divide, inexacts must be detected here. An inexact here
|
|
;; may or may not also raise underflow in __fArithReturn. It is possible
|
|
;; for a normal result to enter here.
|
|
;;
|
|
;; Register usage:
|
|
;; r0 - underflowed number with leading bit set, round and guard bits
|
|
;; r2 - shift count for a (0 - exp)
|
|
;; r3 - sign in bit 0 (negative if set)
|
|
;;
|
|
; __flt_underflow: produce a denormalized (or zero) result.
; In (per the header comments above):
;   r0  - underflowed number with leading bit set, round and guard bits
;   r2  - right-shift count to denormalise (0 - exp)
;   r3  - sign in bit 0 (negative if set)
;   r14 - exception word (inexact may be added here)
; Exits through __fArithReturn.
__flt_underflow
; RDCFix: What part of r2 is valid? Only low byte?
; I don't completely understand this.
; Check for inexact
TST r2, #0x000000E0 ; Check for shift >= 32
ORRNE r14, r14, #INX_bit ; If shift >= 32, lost all bits: inexact

fp_underflow_calc_result
MOV r3, r3, LSL #31 ; Position sign into sign bit position
ORRS r3, r3, r0, LSR r2 ; Combine sign, exponent, and mantissa
BCS fp_underflow_carry
RSB r2, r2, #32 ; Check for inexact to see if we shifted
MOVS r0, r0, LSL r2 ; any set bits out to the right
ORRNE r14, r14, #INX_bit ; If we did, set inexact
MOV r0, r3
B __fArithReturn

fp_underflow_carry ; round bit was set: round up, tie to even
ORR r14, r14, #INX_bit ; RDCFix: Why is inexact guaranteed here?
RSB r2, r2, #33
MOVS r2, r0, LSL r2 ; EQ -> guard bits all zero (tie case)
ADC r0, r3, #0 ; round up
BICEQ r0, r0, #1 ; tie: clear LSB (round to even)
B __fArithReturn
|
|
|
|
|
|
|
|
fsub_uncommon ; subtract path for zeroes/denorms/infs/NaNs
TEQ tmp, exp, LSL #24 ; EQ if a is inf or NaN
BEQ fsub_inf_NaN
fsub_denorm ; here b is denormalized or zero, a might be a normal number
; check whether a or b is zero - fast case
MOVS tmp, a, LSL #1
MOVEQ a, #0 ; -0 - -0 = +0
MOVS b, b, LSL #1 ; EQ if b == 0 or a == 0
BEQ __fArithReturn ; return a - 0 = a
; b is denormalized, a might be
TST exp, #255
BIC a, tmp, #0xFF000000
ORRNE a, a, #1 << 24 ; a normal: insert hidden bit
SUBNE shift, shift, #1 ; correct shift
ADDEQ exp, exp, #1 ; both denorms: effective exponent is 1

; Check for inexact
CMP shift, #31 ; Shift amount >= 31?
ORRGE r14, r14, #INX_bit ; Set inexact (note b != +/-0)
BGE fsub_sub_core
RSB tmp, shift, #31 ; Number of bits lost
MOVS tmp, b, LSL tmp ; Check lower bits of lesser operand
ORRNE r14, r14, #INX_bit ; If bits set, then inexact

RSB b, b, #0 ; negate b (fractions here are not left-justified)
B fsub_sub_core

fsub_inf_NaN
; handle infinities and NaNs - a is infinite or a NaN, b might be
MOVS tmp, a, LSL #9 ; a NaN? (NE)
BNE __fArithNaNCheck
CMP a, b ; a is infinite, b too? (EQ)
ORREQ r14, r14, #IVO_bit ; Set invalid operation if is
ORREQ r0, r0, #fSignalBit ; Make INF into a QNaN
B __fArithReturn ; yes, a & b infinite -> generate IVO
|
|
|
|
|
|
;;
|
|
;; _fArithReturn
|
|
;;
|
|
;; Register Usage:
|
|
;; r0 - Default return value
|
|
;; r14 - Exception information
|
|
;;
|
|
;; Stack:
|
|
;; | Caller's Frame |
|
|
;; | |
|
|
;; +----------------+
|
|
;; | Return Address |
|
|
;; +----------------+
|
|
;; | Original Arg2 |
|
|
;; +----------------+
|
|
;; | Original Arg1 | <-- SP
|
|
;; +----------------+
|
|
;; Stack Top
|
|
;;
|
|
;;
|
|
;; Standard return path for single precision arithmetic routines. It checks
|
|
;; if any exceptions occurred. If any exceptional conditions occurred, then
|
|
;; an FPIEEE exception record is allocated and filled with the appropriate
|
|
;; values and the exception handler called. Upon returning, the possibly
|
|
;; changed result is loaded from the returned union, the stack space is
|
|
;; restored, and control is returned to the caller. If no exceptions occurred,
|
|
;; then the default result is returned.
|
|
;;
|
|
; __fArithReturn: common exit for all single-precision arithmetic.
; r0 = default result, r14 = exception word.  Fast path (no exception
; bits set): pop the frame and return.  Slow path: detect underflow
; (zero exponent + inexact), build the argument block on the frame
; (see stack diagram above) and call FPE_Raise, then return whatever
; result the handler stored at NewResl.
__fArithReturn
TST r14, #FPECause_mask ; Any exceptions?
ADDEQ sp, sp, #LOC_SIZE ; None so pop original args
IF Interworking :LOR: Thumbing
LDMEQFD sp!, {lr} ; and return
BXEQ lr
ELSE
LDMEQFD sp!, {pc} ; and return
ENDIF
; Else we have an exception
; Check for underflow (denormal & inexact)
MOV tmp, #0xFF000000 ; Load up exponent mask << 1
TST r0, tmp, LSR #1 ; See if exponent is zero
BNE no_underflow ; Non-zero exponent so no underflow possible
TST r14, #INX_bit ; See if inexact bit is set
ORRNE r14, r14, #UNF_bit ; If inexact, then underflow
no_underflow
STR r0, [sp, #ExDResl] ; Push default result
LDR r0, [sp, #OrgOp2l] ; Get orig Arg2 off stack
STR r0, [sp, #ExOp2l] ; Push Arg2
LDR r2, [sp, #OrgOp1l] ; Get orig Arg1 off stack
MOV r1, r14 ; ExInfo
ADD r0, sp, #NewResl ; Pointer to result from ex. handler
; Note that this clobbers original
CALL FPE_Raise

IF Thumbing :LAND: :LNOT: Interworking
CODE16
bx pc ; switch back to ARM mode
nop
CODE32
ENDIF ; Arg1 and Arg2 on the stack

LDR r0, [sp, #NewResl] ; Get returned result
ADD sp, sp, #LOC_SIZE ; Pop orig. args and arg passing space
IF Interworking :LOR: Thumbing
LDMFD sp!, {lr} ; Return
BX lr
ELSE
LDMFD sp!, {pc} ; Return
ENDIF
|
|
|
|
|
|
|
|
;; __fArithNaNCheck
|
|
;;
|
|
;; Checks both operands for SNaNs and raises an exception if one is present.
|
|
;; If no SNaNs are present, then a QNaN is returned. At least one of Arg1
|
|
;; and Arg2 must be a NaN.
|
|
;;
|
|
;; Register usage:
|
|
;; r0 - Arg1 (must be a NaN if Arg2 is not)
|
|
;; r1 - Arg2 (must be a NaN if Arg1 is not)
|
|
;; r14 - FP exception information
|
|
;;
|
|
;; Code adapted from except.s.
|
|
; __fArithNaNCheck: at least one of fOP1/fOP2 is a NaN (see header
; comments above).  A signaling NaN (quiet bit clear) is quietened and
; IVO raised; a quiet NaN is returned as-is.  fOP1 wins if it is the
; NaN; otherwise fOP2 is moved into the result.
__fArithNaNCheck
MOV a4, #0x01000000
CMN a4, fOP1, LSL #1 ; HI if fOP1 is a NaN
BLS fcheck_opnd2_NaN
fcheck_opnd1_NaN
TST fOP1, #fSignalBit ; EQ -> signaling NaN
ORREQ fOP1, fOP1, #fSignalBit ; quieten it
ORREQ r14, r14, #IVO_bit ; and raise invalid operation
BEQ __fArithReturn
CMN a4, fOP2, LSL #1 ; fOP2 also a NaN?
BLS __fArithReturn
fcheck_opnd2_NaN
MOV fOP1, fOP2 ; return (possibly quietened) fOP2
TST fOP1, #fSignalBit
ORREQ fOP1, fOP1, #fSignalBit
ORREQ r14, r14, #IVO_bit
B __fArithReturn

ENTRY_END __adds
]
|
|
|
|
|
|
;------------------------------------------------------------------------------
|
|
|
|
; Assembled only when mul_s is defined: single-precision multiply.
[ :DEF: mul_s

AREA |.text|, CODE, READONLY

Export __muls
Export fmul_fdiv_overflow
IMPORT __fArithReturn
IMPORT __fArithNaNCheck
IMPORT __flt_normalise2
IMPORT __flt_underflow

; MULL48: 24x24->48-bit fraction multiply.
; $a/$b hold the left-justified fractions (low 8 bits zero); on exit
; $res holds the high 32 bits with bit 0 set as a sticky OR of the
; discarded low bits.  Also folds the bias subtraction into exp.
MACRO
MULL48 $a, $b, $res, $tmp
; a = AAAAAA00
; b = BBBBBB00

UMULL $tmp, $res, $a, $b
SUB exp, exp, #128 << 16 ; subtract bias+1 - 0..253 normal
CMP $tmp, #0
ORRNE $res, $res, #1 ; sticky bit from the low word

MEND
|
|
|
|
|
|
|
|
; Prologues for __adds, __subs, __muls, and __divs must be the same
; float __muls(float a, float b): IEEE single-precision multiply.
; In: r0 = a, r1 = b.  Out: r0 = a * b.  Exponents are kept in bits
; 16..23 of expa/expb (sign in bit 8 of expa) on this path.
NESTED_ENTRY __muls
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate local storage
PROLOG_END
STR r0, [sp, #OrgOp1l] ; Save off args in case of exception
MOV r14, #_FpMulS ; Initialize no exceptions, float multiply
STR r1, [sp, #OrgOp2l]

MOV mask, #255 << 16
ANDS expa, mask, a, LSR #7 ; extract exp_a (EQ if zero/denorm)
ANDNES expb, mask, b, LSR #7 ; extract exp_b
TEQNE expa, mask ; inf/NaN?
TEQNE expb, mask
BEQ fmul_uncommon
TEQ a, b ; MI if result negative
ORRMI expa, expa, #1 << 8 ; record result sign
MOV mask, #1 << 31
ORR a, mask, a, LSL #8 ; left-justify fraction, add hidden bit
ORR b, mask, b, LSL #8
fmul_mul
ADD exp, expa, expb
MULL48 a, b, res, tmp

;; r1 now available for scratch

CMP res, #&80000000 ; product in [1.0,2.0)? (LO) or [2.0,4.0)? (HS)
ORRCS r1, tmp, res, LSL #24 ; Check res low 8 bits, low bits
MOVLO res, res, LSL #1 ; normalise product < 2.0
ADC exp, exp, exp, ASR #16 ; recombine sign & exp, and adjust exp
ORRS r1, tmp, res, LSL #25 ; check low 7 bits, low bits
ORRNE r14, r14, #INX_bit

fmul_round
MOVS a, res, LSR #8 ; never EQ (leading bit)
ADC a, a, exp, LSL #23 ; add fraction, and round
TSTCS res, #0x7f ; EQ -> round to even
CMPNE exp, #252 << 16 ; possible overflow? (never EQ)
BLO __fArithReturn ; return if no overflow and no round to even
BICEQ a, a, #1 ; delayed round to even
CMP exp, #252 << 16
BLO __fArithReturn
BPL fmul_fdiv_overflow ; MI here means exponent underflowed

fmul_underflow ; result may be normalised after rounding
MOV r1, a, LSL #1
SUB r1, r1, #1 << 24
CMP r1, #3 << 24 ; result exp in 1..3 -> return
BLO __fArithReturn
MOV a, res
MVN sign, exp, LSR #8 ; correct sign from underflowed exponent
RSB exp, exp, #8 ; calc denormalising shift
B __flt_underflow

;;
;; fmul_fdiv_overflow is shared between __muls and __divs.
;;
fmul_fdiv_overflow ; result might not be overflowed after all
MOV tmp, a, LSL #1
ADD tmp, tmp, #1 << 24
CMP tmp, #254 << 24 ; Check for exp = 253 or 254
BHS __fArithReturn ; no overflow - 9 cycles overhead
SUBS a, a, exp, LSL #7 ; get correct sign
ORR r14, r14, #OVF_bit :OR: INX_bit ; Set overflow and inexact
MOV a, #0x7F000000 ; Create a properly signed INF
ORR a, a, #0x00800000 ; ...
ORRMI a, a, #0x80000000 ; ...
B __fArithReturn

fmul_uncommon ; a or b denorm/NaN/inf
AND expb, mask, b, LSR #7
TEQ a, b
ORRMI expa, expa, #1 << 8 ; record result sign
CMP expa, mask
CMPLO expb, mask
BHS fmul_inf_NaN
; a or b denorm, first check for zero case
MOVS tmp, a, LSL #1
MOVNES tmp, b, LSL #1
MOVEQ a, expa, LSL #23 ; return signed zero
BEQ __fArithReturn
; normalise operands, then resume at fmul_mul
ADR tmp, fmul_mul
B __flt_normalise2

fmul_inf_NaN ; a or b is a NaN or infinite
MOV tmp, #0x01000000
CMN tmp, a, LSL #1
CMNLS tmp, b, LSL #1
BHI __fArithNaNCheck
; now a or b is infinite - check that a and b are non-zero
MOVS tmp, a, LSL #1 ; a zero?
MOVNES tmp, b, LSL #1 ; b zero?
ORRNE expa, expa, #255 ; create infinite
MOV a, expa, LSL #23 ; with correct sign
; If NE: a & b nonzero, return infinite
ORREQ r14, r14, #IVO_bit ; If EQ: inf * 0 signals an exception
ORREQ a, a, #0x7F000000 ; Create a QNaN
ORREQ a, a, #0x00C00000 ; ...
B __fArithReturn

ENTRY_END __muls
]
|
|
|
|
;---------------------------------------------------------------------------
|
|
|
|
; Assembled only when div_s is defined: single-precision divide.
[ :DEF: div_s

AREA |.text|, CODE, READONLY

Export __divs
IMPORT __flt_normalise2
IMPORT __fArithReturn
IMPORT __flt_underflow
IMPORT fmul_fdiv_overflow
IMPORT __fArithNaNCheck

; TODO: * halve lookup table size

; Reciprocal-estimate byte table for the Newton-Raphson seed.  Note
; the fdiv_tab label FOLLOWS the data: fdiv_div indexes it with a
; NEGATIVE pc-relative offset (LDRB guess, [pc, -tmp, LSR #16]).
DCB 0, 0, 0, 0
DCB 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17
DCB 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 35, 36
DCB 37, 39, 40, 41, 43, 44, 46, 47, 48, 50, 51, 53, 54, 56, 57, 59
DCB 60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 80, 81, 83, 85
DCB 87, 88, 90, 92, 94, 96, 98,100,102,104,106,108,110,112,114,116
DCB 118,120,122,125,127,129,131,134,136,138,141,143,146,148,151,153
DCB 156,158,161,164,166,169,172,175,178,180,183,186,189,192,195,199
DCB 202,205,208,212,215,218,222,225,229,233,236,240,244,248,252,255
fdiv_tab
|
|
|
|
; Prologues for __adds, __subs, __muls, and __divs must be the same
; float __divs(float a, float b): IEEE single-precision divide a / b.
; Algorithm: table lookup seeds ~8 bits of 1/den, one Newton-Raphson
; step extends it to ~15 bits, then two multiply-assisted long-division
; steps (13 + 11 bits) produce the 24-bit quotient with remainder in
; num for rounding/sticky.  "result delay" comments refer to filling
; multiplier latency slots on StrongARM-class cores.
NESTED_ENTRY __divs
EnterWithLR_16
STMFD sp!, {lr} ; Save return address
SUB sp, sp, #LOC_SIZE ; Allocate local storage
PROLOG_END
STR r0, [sp, #OrgOp1l] ; Save off args in case of exception
MOV r14, #_FpDivS ; Initialize no exceptions, float divide
; Note that r14 is also used by "guess"
STR r1, [sp, #OrgOp2l]
MOV mask, #255 << 16
ANDS expa, mask, a, LSR #7
ANDNES expb, mask, b, LSR #7
CMPNE expa, #255 << 16
CMPNE expb, #255 << 16
BEQ fdiv_uncommon
TEQ a, b
ADDMI expa, expa, #1 << 8 ; record result sign
ORR tmp, a, #1 << 23 ; insert hidden bits
ORR den, b, #1 << 23
BIC num, tmp, #0xFF000000 ; num = numerator fraction (a)
BIC den, den, #0xFF000000 ; den = denominator fraction (b)
fdiv_div
; calculate exponent and find leading bit of result
SUB exp, expa, expb
CMP num, den ; LO -> quotient < 1.0, need one extra shift
; this code fills result delay slots
;MOVLO num, num, LSL #1 ; shift so that div >= 1 << 23
;ADD exp, exp, #(127-2) << 16 ; subtract bias (one too small)
;ADC exp, exp, exp, ASR #16 ; calc exp, combine with sign

; lookup guess of 1/den - use rounded inverted tablelookup
ADD tmp, den, #32768 + ((. + 12 - (fdiv_tab + 127)) << 16)
LDRB guess, [pc, -tmp, LSR #16]
RSB den, den, #0 ; result delay - negate den for MLA
ADD guess, guess, #256 ; restore implicit high bit of estimate

; do one Newton-Rhapson iteration to increase precision to 15 bits
MUL tmp, den, guess
MOVLO num, num, LSL #1 ; result delay - shift so that div >= 1 << 23
MOV tmp, tmp, ASR #4
MUL div, tmp, guess
MOV guess, guess, LSL #7
ADD guess, guess, div, ASR #21

; long division - 13 bits
MOV tmp, num, LSR #10
MUL tmp, guess, tmp
MOV num, num, LSL #12
MOV div, tmp, LSR #17
MLA num, den, div, num ; num -= den * div (den is negated)
ADD exp, exp, #(127-2) << 16 ; result delay - subtract bias (one too small)

; long division - 11 bits (can do 12)
MOV tmp, num, LSR #10
MUL tmp, guess, tmp
MOV num, num, LSL #11
MOV tmp, tmp, LSR #18
MLA num, den, tmp, num
ADC exp, exp, exp, ASR #16 ; result delay - calc exp, combine with sign

; correct div (may be one too small)
CMN num, den
ADDHS num, num, den ; now num < den
ADC div, tmp, div, LSL #11 ; merge partial quotients + correction

MOV r14, #_FpDivS ; Reinitialize no exceptions, float divide
; Note that r14 was used for "guess"
CMP num, #0 ; Check for inexact (nonzero remainder)
ORRNE r14, r14, #INX_bit ; Set inexact if bits lost
CMN den, num, LSL #1 ; CS -> round, EQ -> round to even
ADC a, div, exp, LSL #23 ; recombine exp and fraction - increment exp
CMPNE exp, #252 << 16 ; exp < 252 cannot overflow
BLO __fArithReturn
BICEQ a, a, #1 ; round to even
CMP exp, #252 << 16 ; exp < 252 cannot overflow
BLO __fArithReturn
BPL fmul_fdiv_overflow ; MI here means exponent underflowed

fdiv_underflow ; result may be normalised after rounding
MOV tmp, a, LSL #1
SUB tmp, tmp, #1 << 24
CMP tmp, #3 << 24 ; result exp in 1..3 -> return
BLO __fArithReturn
CMP num, #1 ; num contains implicit guard bits
ADC a, div, div ; add explicit guard bit (1 if num > 0)
MVN sign, exp, LSR #8 ; get correct sign
RSB exp, exp, #1 ; calc 1 - exp
B __flt_underflow

fdiv_uncommon ; a or b is zero/denorm/inf/NaN
AND expb, mask, b, LSR #7
TEQ a, b
ORRMI expa, expa, #1 << 8 ; record result sign
CMP expa, mask
CMPLO expb, mask
BHS fdiv_inf_NaN
; a or b denorm, first check for zero case
MOVS tmp, b, LSL #1
BEQ fdiv_divbyzero ; a / 0 -> division by zero
MOVS tmp, a, LSL #1 ; 0 / b -> 0
MOVEQ a, expa, LSL #23 ; return signed zero
BEQ __fArithReturn
; normalise operands, then resume at fdiv_div1
ADR tmp, fdiv_div1
B __flt_normalise2

fdiv_div1 ; remove... quick hack
; undo the left-justification done by __flt_normalise2
MOV tmp, a, LSR #8
MOV den, b, LSR #8
MOV num, tmp
B fdiv_div

fdiv_inf_NaN ; a or b is a NaN or infinite
MOV tmp, #0x01000000
CMN tmp, a, LSL #1
CMNLS tmp, b, LSL #1
BHI __fArithNaNCheck
; now a or b is infinite - check that a and b are not both infinite
CMN tmp, a, LSL #1
CMNEQ tmp, b, LSL #1
MOVEQ a, expa, LSL #23
ORREQ r14, r14, #IVO_bit ; Set invalid
ORREQ a, a, #0x7F000000 ; Create QNaN
ORREQ a, a, #0x00C00000 ; ...
BEQ __fArithReturn ; inf / inf -> IVO
CMN tmp, b, LSL #1 ; b inf? (EQ)
MOVEQ a, #0 ; a / inf -> signed zero
BICNE a, a, #1 << 31 ; inf / b = inf (even inf / 0 = inf)
ORR a, a, expa, LSL #23 ; set sign
B __fArithReturn

fdiv_divbyzero ; b zero
MOVS tmp, a, LSL #1
ORREQ r14, r14, #IVO_bit ; 0 / 0 -> IVO
ORREQ a, a, #0x7F000000 ; Create QNaN
ORREQ a, a, #0x00C00000 ; ...
ORRNE r14, r14, #DVZ_bit ; A / 0 -> DVZ
MOVNE a, expa, LSL #23 ; set sign of result (returns signed inf)
ORRNE a, a, #0x7F000000 ; Create properly signed INF
ORRNE a, a, #0x00800000 ; ...
B __fArithReturn

ENTRY_END __divs
]
|
|
|
|
;---------------------------------------------------------------------------
|
|
|
|
; Assembled only when fnorm2_s is defined: shared denormal normaliser.
[ :DEF: fnorm2_s

AREA |.text|, CODE, READONLY

EXPORT __flt_normalise2

; normalise a or b (or both). One operand is denormalised
; a = x0AAAAAA, bits 0-22 nonzero, bits 23-30 zero
; normalise such that bit 23 = 1
; (fractions are left-justified here; exponents in expa/expb are
; decremented in 16/8/4/2/1 steps, in <<16 units, as the fraction is
; shifted up; the hidden bit is set for the already-normal operand)
; return to address in tmp

[ :DEF: thumb
CODE32
]
__flt_normalise2
MOV a, a, LSL #8 ; left-justify both fractions
MOV b, b, LSL #8
TST expa, #255 << 16 ; a already normal?
BNE fnorm_b
fnorm_a
; binary-search the leading bit of a up to bit 31
CMP a, #1 << 16
SUBLO expa, expa, #16 << 16
MOVLO a, a, LSL #16
TST a, #255 << 24
SUBEQ expa, expa, #8 << 16
MOVEQ a, a, LSL #8
TST a, #15 << 28
SUBEQ expa, expa, #4 << 16
MOVEQ a, a, LSL #4
TST a, #3 << 30
SUBEQ expa, expa, #2 << 16
MOVEQS a, a, LSL #2
MOVPL a, a, LSL #1
ADDMI expa, expa, #1 << 16

TST expb, #255 << 16 ; b normal?
ORRNE b, b, #1 << 31 ; yes: just set its hidden bit
MOVNE pc, tmp ; and return to caller-supplied address
fnorm_b
ORR a, a, #1 << 31 ; a is normal: set its hidden bit
; binary-search the leading bit of b up to bit 31
CMP b, #1 << 16
SUBLO expb, expb, #16 << 16
MOVLO b, b, LSL #16
TST b, #255 << 24
SUBEQ expb, expb, #8 << 16
MOVEQ b, b, LSL #8
TST b, #15 << 28
SUBEQ expb, expb, #4 << 16
MOVEQ b, b, LSL #4
TST b, #3 << 30
SUBEQ expb, expb, #2 << 16
MOVEQS b, b, LSL #2
MOVPL b, b, LSL #1
ADDMI expb, expb, #1 << 16
MOV pc, tmp ; return to caller-supplied address

]

;===========================================================================

END
|