singrdk/base/Kernel/Native/arm/Crt/veneer_f.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Microsoft Research Singularity
;;;
;;; Copyright (c) Microsoft Corporation.  All rights reserved.
;;;
;;; This file contains ARM-specific assembly code.
;;;

; veneer_f.s - float add/sub/mul/div
;
; Copyright (C) Advanced RISC Machines Limited, 1994. All rights reserved.
;
; RCS Revision: 1
; Checkin Date: 2007/06/29 02:59:16
; Revising Author
;

; Local storage size and offsets.
; Each entry point in this file pushes lr and then allocates LOC_SIZE bytes;
; the offsets below are relative to SP after that allocation.
LOC_SIZE   EQU  0x18                    ; Bytes of local storage per call
OrgOp2l    EQU  0x14                    ; Saved original second operand (r1)
OrgOp1l    EQU  0x10                    ; Saved original first operand (r0)
ExDResl    EQU  0x08                    ; Default result, stored before FPE_Raise
ExOp2l     EQU  0x00                    ; Copy of operand 2, stored before FPE_Raise
NewResl    EQU  0x10                    ; Result slot passed to FPE_Raise; same
                                        ;   offset as OrgOp1l, so it reuses that slot


        GET     fpe.asm
        GET     kxarm.inc


; Register aliases.  Note that several names map onto the same register.
a       RN 0                            ; first operand on entry, result on exit
b       RN 1                            ; second operand on entry
tmp     RN 12                           ; scratch
mask    RN 12                           ; exponent mask (same register as tmp)
expa    RN 2                            ; exponent (and sign) of a
expb    RN 3                            ; exponent of b
exp     RN expa                         ; result exponent (same register as expa)
sign    RN expb                         ; result sign (same register as expb)
shift   RN expb                         ; alignment shift (same register as expb)
res     RN expb                         ; multiply result (same register as expb)
guess   RN 14                           ; reciprocal estimate in __divs; r14 also
                                        ;   carries the exception information
num     RN b                            ; __divs numerator lives in r1
den     RN a                            ; __divs denominator lives in r0
div     RN 3                            ; __divs quotient


;===============================================================================
;
; RDCFix:
; BUGBUG: These comments aren't necessarily right anymore.
;
;


; __adds/__subs:
;
;   Upon entry the signs are checked, and if they are not equal, control is given
;   to the inverse routine while negating the second operand. Ie. __adds(+A,-B) ->
;   __subs(+A,+B) and __subs(-A,+B) -> __adds(-A,-B). After this check the signs are
;   known to be equal.
;
;   The operands are sorted to ensure that A >= B. This enables many checks to
;   be simplified: (A == 0) || (B == 0) reduces to (B == 0). The calculations
;   are also simpler: only operand B needs to be shifted. Unsigned arithmetic
;   is used to compare the packed numbers, since we want to have the operand with
;   the largest magnitude as operand A.
;
;   Special cases, namely zeroes, infinities, denormals and Not-a-Numbers (NaNs)
;   are checked for on entry. If one of the operands is special, a jump is made
;   to handle these cases out of line to keep overhead for the general case as
;   low as possible. Because the operands are sorted, only 2 checks need to be
;   made: operand A is checked for NaN, while operand B is checked for zero.
;
;   As the signs of the operands are known to be equal and the operands
;   are ordered, the sign of the result is the sign of one of the operands.
;   Since the exponent can only change a little (one in __adds, and often little
;   in __subs), the sign and exponent are not separated.
;
;   In __adds, the operands are added with the smallest one shifted right with the
;   exponent difference. The fraction might be larger than 1.0, and is renormalised
;   if necessary (max 1 right shift needed). The exponent is adjusted with -1
;   (+ 1 if the fraction was >= 2.0) to counter for the leading bit when the
;   fraction and exponent are combined (using an ADD instruction).
;
;   In __subs, operand B is subtracted from a, after being shifted right with the
;   exponent difference. The result cannot be negative since A >= B, but it can
;   result in an unnormalized number (as the high bits of A and B might cancel out).
;   The common case results in the exponent being adjusted with +0 or -1, this is
;   when the MSB is still set, or when the next bit is set. In the last case
;   underflow to a denormalized number is possible. Rounding proceeds as normal.
;   When 2 or more leading bits of the result are clear, the result must be
;   normalized. If the resulting exponent is smaller than zero, denormalization
;   follows. No rounding is necessary (the round bit is zero since we shifted
;   left by at least 2 bits).
;
;   In the rounding stage, the exponent is recombined with the fraction
;   which leading bit is still set (if it is normalized). This causes the
;   exponent to increment by one. Therefore, the exponent has been decremented
;   in an earlier stage.
;   The round-bit is calculated in the result by using more precision than
;   necessary. After the result is shifted right to remove these, the carry
;   contains the roundbit.
;   The guard bits are in the second operand, which are calculated by
;   left shifting. This is only necessary if the roundbit was set.
;   Round to even is implemented by always rounding upwards, and
;   clearing the LSB in case the guard bits were all zero. Thus an odd value will
;   be rounded up, while an even value will not. While rounding, the fraction may
;   become too large (>= 2.0), at which time the exponent must be incremented and
;   the fraction shifted right. However, this doesn't need extra code, since
;   exponent and fraction are already combined: the overflow from the fraction
;   increments the exponent. Note that this means a denormalized number might
;   become normalized while rounding!
;
;   For __adds, overflow is being checked after rounding by adding 1 to the exponent.
;   If the result was overflowed, the sign bit inverts (overflowed exponent is 255,
;   and 255+1 negates the sign bit). Note that overflow can only occur after
;   renormalization, or during rounding, but not in both.
;   Overflow cannot occur in __subs.
;
;   If one of the operands is an uncommon number, the following happens:
;   If the largest operand is a NaN, an Invalid Operation is raised (which
;   returns the NaN if it is a quiet NaN).
;   For __adds, infinities are returned unaltered (inf + inf = inf), but in __subs an
;   Invalid Operation exception is raised for inf - inf.
;   If the smallest operand is a zero, the other operand is returned (thus A + 0
;   -> A, A - 0 -> A, but a special case is -0 - -0 -> +0).
;   Denormalized numbers are handled by decoding an unnormalized fraction with
;   exponent 1. This is to make up for the hidden bit which is clear in
;   denormalized numbers. Normal addition or subtraction can now proceed without any
;   modification (the algorithms don't rely on the operands being normalized).
;   The result can be a denormalized number or a normalized number.
;
;   Frsb (B - A) is implemented by negating both signs on input of __subs. Its use
;   is mainly intended for code size optimization.
;
;===========================================================================

        [ :DEF: add_s

	AREA   |.text|, CODE, READONLY

        Export  __adds
        Export  __subs
        Export  __fArithReturn         ;; RDCFix: Should move to common area
        Export  __fArithNaNCheck       ;; RDCFix: Should move to common area
        Export  __flt_underflow        ;; RDCFix: Should move to common area
        IMPORT  FPE_Raise
        [ :DEF: thumb
        CODE32
        ]

; Prologues for __adds, __subs, __muls, and __divs must be the same.
;
; __subs - single precision subtract entry point.
;   In:   r0 = first operand, r1 = second operand
;   Saves lr, reserves LOC_SIZE bytes of locals, stores both original
;   operands in the frame, loads the operation code _FpSubS into r14 (no
;   exception bits set yet) and continues at fsubtract.
    NESTED_ENTRY __subs
        EnterWithLR_16
        STMFD   sp!, {lr}               ; Save return address
        SUB     sp, sp, #LOC_SIZE       ; Allocate stack space
    PROLOG_END
        STR     r1, [sp, #OrgOp2l]      ; Save original args in case of exception
        MOV     r14, #_FpSubS           ; Initialize no exceptions, float sub
        STR     r0, [sp, #OrgOp1l]      ; Save original args in case of exception
        B       fsubtract

    ENTRY_END __subs


; Prologues for __adds, __subs, __muls, and __divs must be the same.
;
; __adds - single precision add entry point.
;   In:   r0 = first operand, r1 = second operand
;   Saves lr, reserves LOC_SIZE bytes of locals, stores both original
;   operands in the frame, loads the operation code _FpAddS into r14 (no
;   exception bits set yet) and continues at faddition.
    NESTED_ENTRY __adds
        EnterWithLR_16
        STMFD   sp!, {lr}               ; Save return address
        SUB     sp, sp, #LOC_SIZE       ; Allocate stack space
    PROLOG_END
        STR     r1, [sp, #OrgOp2l]      ; Save original args in case of exception
        MOV     r14, #_FpAddS           ; Initialize no exceptions, float add
        STR     r0, [sp, #OrgOp1l]      ; Save original args in case of exception
        B       faddition


; _faddn - reached from fsubtract when the operand signs differ:
; flip the sign of b and perform the operation as an addition.
_faddn                                  ; Branch to here from subtract
        EOR     b, b, #1 << 31          ; negate b (toggle sign bit)
        B       _fadd1


; faddition - addition front end.
;   Hands unequal-sign operands to the subtract path, orders the operands so
;   that a >= b (unsigned compare of the packed values), extracts the
;   exponents and branches to fadd_uncommon for zero/denormal b or inf/NaN a.
;   On fall-through into fadd_add:
;     a, b  = fractions with the leading one in bit 31, low 8 bits clear
;     exp   = sign<<8 + exponent of a
;     shift = exponent difference
faddition
        ; if the signs are unequal, it is a subtract
        TEQ     a, b
        BMI     _fsubn
_fadd1
        ; swap a and b so that a >= b
        SUBS    tmp, a, b               ; tmp = a - b, LO if a < b (unsigned)
        SUBLO   a, a, tmp               ; a = a - (a - b) = old b
        ADDLO   b, b, tmp               ; b = b + (a - b) = old a
        ; decode exponents, and filter out special cases
        MOV     exp, a, LSR #23         ; exp = sign<<8 + exponent
        SUB     shift, exp, b, LSR #23  ; shift = 0..254 (sign bits cancel out)
        MOV     tmp, #255 << 24
        TST     tmp, b, LSL #1          ; check for denorm/zero
        TEQNE   tmp, exp, LSL #24       ; check for inf/NaN
        BEQ     fadd_uncommon           ; handle zeroes/denorms/infs/NaNs
        ; decode fractions and add the leading one
        MOV     tmp, #1 << 31
        ORR     a, tmp, a, LSL #8       ; a = 1.frac_a
        ORR     b, tmp, b, LSL #8       ; b = 1.frac_b

; fadd_add - add the aligned fractions, round, and detect overflow.
;   In:   a, b = fractions (significant bits in 8..31), shift = alignment
;         shift for b, exp = sign<<8 + exponent, r14 = exception information
;   The inexact bit is set when any significant bit of b is shifted below
;   bit 8 of the sum.  A carry out of the addition is handled in fadd_carry.
fadd_add
                                        ; Check for inexact where all bits lost
        CMP     shift, #24              ; Shift amount >= 24?
        ORRGE   r14, r14, #INX_bit      ; Set inexact (note b != +/-0)
        BGE     fadd_add_core
        RSB     tmp, shift, #24         ; Number of bits lost
        MOVS    tmp, b, LSL tmp         ; Check lower bits of lesser operand
        ORRNE   r14, r14, #INX_bit      ; If bits set, then inexact


fadd_add_core                           ; do the addition and renormalise
        ADDS    a, a, b, LSR shift      ; CS if a >= 2.0

        BCS     fadd_carry
        ADD     exp, exp, #-1           ; adjust exp for leading bit
        MOVS    a, a, LSR #8            ; CS -> round up (never EQ)
        ADC     a, a, exp, LSL #23      ; combine sign, exp & fraction and round
        BCC     __fArithReturn          ; round bit clear: result is final
        RSB     shift, shift, #25
        MOVS    b, b, LSL shift         ; calc guard bits: CS,EQ -> round to even
        MOV     tmp, a, LSL #1          ; drop the sign so the exponent is on top
        CMNNE   tmp, #1 << 24           ; check for overflow (if not round to even)
        BCC     __fArithReturn          ; return if NOT(overflow OR round to even)
        BICEQ   a, a, #1                ; round to even
        CMN     tmp, #1 << 24           ; check for overflow
        BCC     __fArithReturn

; fadd_overflow - the sum overflowed: flag overflow and inexact, and return
; an infinity (0x7F800000) carrying the sign currently held in r0.
; Clobbers r1.
fadd_overflow                           ; sign in a is correct
        ORR     r14, r14, #OVF_bit :OR: INX_bit
                                        ; Set overflow and inexact
        MOVS    r0, r0                  ; Check sign of result
        MOV     r1, #0xFF               ; Load up a correctly signed INF
        MOV     r0, r1, LSL #23         ; Move unsigned INF into result
        ORRMI   r0, r0, #0x80000000     ; Set sign bit if result negative
        B       __fArithReturn

; fadd_carry - the fraction addition carried out of bit 31 (sum >= 2.0).
;   The carry is rotated back in as the new leading bit, which shifts the
;   fraction right by one; the bit that thereby drops below bit 8 is checked
;   for inexact before rounding and the overflow test.
fadd_carry
        MOV     a, a, RRX               ; restore leading bit
        MOVS    tmp, a, ROR #8          ; Check for inexact
        ORRMI   r14, r14, #INX_bit      ; Set inexact if bit set
        MOVS    a, a, LSR #8            ; CS -> round up (never EQ)
        ADC     a, a, exp, LSL #23      ; combine sign, exp & fraction and round
        MOV     tmp, a, LSL #1          ; drop the sign so the exponent is on top
        CMNCC   tmp, #1 << 24           ; check for overflow (if not round to even)
        BCC     __fArithReturn
        CMN     tmp, #1 << 24
        BCS     fadd_overflow
        RSB     shift, shift, #24
        MOVS    b, b, LSL shift         ; doesn't set carry if shift = 24!
        BICEQ   a, a, #1                ; guard bits all zero -> round to even
        B       __fArithReturn


; fadd_uncommon - addition with a special operand.
;   Entered with tmp = 255 << 24 and exp/shift already decoded.  Either a has
;   an all-ones exponent (inf/NaN) or b has a zero exponent (zero/denormal).
;   Denormals are decoded without the hidden bit and rejoin fadd_add.
fadd_uncommon
        ; handle denorms, infinities and NaNs
        TEQ     tmp, exp, LSL #24       ; filter out NaNs and infinities (EQ)
        BEQ     fadd_inf_NaN
        ; fast check for zeroes
        MOVS    tmp, b, LSL #1          ; EQ if b is zero
        BEQ     __fArithReturn          ; return A + 0 = A
        ; b is denormalized, a might be
        MOV     a, a, LSL #8            ; a = 0.frac_a
        MOV     b, b, LSL #8            ; b = 0.frac_b
        TST     exp, #255               ; a denormalized? (exp == 0 -> EQ)
        ORRNE   a, a, #1 << 31          ; no denorm, add leading one
        SUBNE   shift, shift, #1        ; correct shift
        ADDEQ   exp, exp, #1            ; both denorms - correct exp
        B       fadd_add

; fadd_inf_NaN - a has an all-ones exponent.  An infinity is returned
; unchanged; a NaN is passed on to __fArithNaNCheck.
fadd_inf_NaN
        ; handle infinities and NaNs - a is infinite or NaN, b might be
        MOVS    tmp, a, LSL #9          ; EQ if a inf, NE if a NaN
        BEQ     __fArithReturn
        B       __fArithNaNCheck


; _fsubn - reached from faddition when the operand signs differ:
; flip the sign of b and perform the operation as a subtraction.
_fsubn                                  ; Branch here from add
        EOR     b, b, #1 << 31          ; negate b (toggle sign bit)
        B       _fsub1


; fsubtract - subtraction front end.
;   Hands unequal-sign operands to the add path, orders the operands so that
;   a >= b (negating both when they are swapped), extracts the exponents and
;   branches to fsub_uncommon for zero/denormal b or inf/NaN a.
;   On fall-through into fsub_sub_core:
;     a     = fraction with the leading one in bit 24, bits 25..31 clear
;     b     = negated fraction of the smaller operand
;     exp   = sign<<8 + exponent of a
;     shift = exponent difference
fsubtract
        ; if the signs are unequal, it is an addition
        TEQ     a, b
        BMI     _faddn
_fsub1
        ; swap a and b so that a >= b
        SUBS    tmp, a, b
        EORLO   tmp, tmp, #1 << 31      ; negate both opnds (A - B = -B - -A)
        SUBLO   a, a, tmp
        ADDLO   b, b, tmp
        ; decode exponents, and filter out special cases
        MOV     exp, a, LSR #23         ; exp = sign<<8 + exponent
        SUB     shift, exp, b, LSR #23  ; shift = 0..254 (sign bits cancel out)
        MOV     tmp, #255 << 24
        TST     tmp, b, LSL #1          ; check for denorm/zero
        TEQNE   tmp, exp, LSL #24       ; check for inf/NaN
        BEQ     fsub_uncommon           ; handle zeroes/denorms/infs/NaNs
        ; decode fractions and add the leading one
        ORR     a, tmp, a, LSL #1       ; top byte forced to ones (sets bit 24)
        BIC     a, a, #0xFE000000       ; keep leading one in bit 24 only
        ORR     b, tmp, b, LSL #1       ; b = 0xFE000000 + (1.frac_b << 1)

        ; Check for inexact
        CMP     shift, #32              ; Shift amount >= 32?
        ORRGE   r14, r14, #INX_bit      ; Set inexact (note b != +/-0)
        BGE     fsub_sub_core           ; b not negated here, but its top bit is
                                        ;   set so ASR by >= 32 still yields -1
        RSB     tmp, shift, #32         ; Number of bits lost
        MOVS    tmp, b, LSL tmp         ; Check lower bits of lesser operand
        ORRNE   r14, r14, #INX_bit      ; If bits set, then inexact

fsub_dosub
        RSB     b, b, #0xFE000000       ; Negate B

; fsub_sub_core - subtract the aligned fractions and classify the result by
; the number of leading bits that cancelled (0, 1, or >= 2).
;   In:   a = fraction (leading one in bit 24), b = negated smaller fraction,
;         shift = alignment shift, exp = sign<<8 + exponent
;   Falls through into fsub_renorm_1 when exactly one left shift is needed.
fsub_sub_core
        ; do the subtraction and calc number of bits to renormalise (0, 1, >=2)
        ADD     a, a, b, ASR shift
        MOVS    tmp, a, LSL #8          ; CS = 10/11, CC,MI = 01, CC,PL = 00
        BCS     fsub_renorm_0           ; high bit still set - no renormalisation
        BPL     fsub_renormalise        ; high 2 bits clear - renormalise >= 2 bits
        TST     expa, #254              ; exp == 1? (cannot be zero)
        BEQ     fsub_renormalise        ; yes -> underflow to denormalized number
fsub_renorm_1
        ; 1 left shift needed, exp -= 1
        MOV     a, tmp, ASR #8          ; doesn't set carry - no early exit!
; TST tmp, #0xFF           ; RDCFix: Need this?
; ORRNE r14, r14, #INX_bit ; RDCFix: Need this?
        RSBS    shift, shift, #32+1     ; shift can be <= 0...
        MOVLS   shift, #1               ; shift 1 -> CS and NE - always roundup
        MOVS    b, b, LSL shift         ; calc rounding (CS) and guard bits (EQ)
        ADC     a, a, exp, LSL #23      ; recombine sign, exponent and fraction
        BCC     __fArithReturn          ; no round-up happened
        BNE     __fArithReturn          ; guard bits non-zero: keep rounded value
; ORR  r14, r14, #INX_bit  ; RDCFix: Need this?
        BICEQ   a, a, #1                ; round to even
        B       __fArithReturn

; fsub_renorm_0 - the leading bit survived the subtraction.
;   In:   tmp = a << 8 (from fsub_sub_core), shift, exp, b as before
;   Flags inexact if any of the bits about to be shifted out of tmp is set,
;   then rounds (to even when the guard bits of b are all zero).
fsub_renorm_0
        ; no renormalisation needed
                                  ; RDCFix: Is this right?
        MOVS    a, tmp, LSL #32-9       ; Check if we're throwing away any bits
        ORRNE   r14, r14, #INX_bit      ; If we are, set inexact
        MOVS    a, tmp, LSR #9          ; CS -> round up
        ADC     a, a, exp, LSL #23      ; recombine sign, exponent and fraction
        BCC     __fArithReturn
        RSBS    shift, shift, #32-0     ; shift can be <= 0... -> don't round to even
        MOVHSS  b, b, LSL shift         ; EQ -> round to even
        BNE     __fArithReturn
        BICEQ   a, a, #1                ; round to even
        B       __fArithReturn

; fsub_renormalise - two or more leading bits cancelled.
;   Shifts a left in steps of 8, 8, 4, 2 and 1 until bit 23 is set, lowering
;   exp by the same amount.  The sign (bit 8 of exp) is saved first so that a
;   borrow out of the exponent field can be detected afterwards.  If the
;   exponent underflowed, exp is negated and execution falls through into
;   __flt_underflow (the code that follows this block).
fsub_renormalise
        ; >= 2 bits renormalisation needed
        MOV     sign, exp, LSR #8       ; remember the sign bit
        TST     a, #0x00FF0000          ; bit 16..23 set?
        BNE     fsub_renorm_small
fsub_renorm_large
        ; bit 16..23 clear, >= 8 bits renormalisation
        MOVS    a, a, LSL #8
        BEQ     __fArithReturn          ; return +0 if result is zero
        SUB     exp, exp, #8
        TST     a, #0x00FF0000          ; bit 16..23 set?
        MOVEQ   a, a, LSL #8
        SUBEQ   expa, expa, #8
fsub_renorm_small
        ; renormalise A until bit 23 is set
        TST     a, #0x00F00000
        MOVEQ   a, a, LSL #4
        SUBEQ   exp, exp, #4
        TST     a, #0x00C00000
        MOVEQ   a, a, LSL #2
        SUBEQ   exp, exp, #2
        CMP     a, #1 << 23
        MOVCC   a, a, LSL #1
        ADC     exp, exp, #-3
        TEQ     sign, exp, LSR #8       ; exponent underflow? (signs differ if so)
        ADDEQ   a, a, exp, LSL #23     ; no rounding necessary
        BEQ     __fArithReturn
        ; underflow to denormalized number
        RSB     exp, exp, #0            ; r2 = shift count expected by __flt_underflow

;;
;; RDCFix: Move this to a common area (out of the adds/subs routine).
;;
;; Code adapted from except.s __flt_underflow
;;
;; Builds a denormalized (or zero) result by shifting the fraction in r0
;; right by the count in r2, combining it with the sign from r3, and rounding.
;; The inexact bit is set in r14 whenever set bits are shifted out.
;;
;; Note that an underflow cannot occur for an add nor a subtract.  This is
;; because the Pegasus FP Specification states that underflow happens if the
;; result is denormal (or zero) after rounding and inexact.  Since the only
;; way we can get a denormal result from an add or subtract is to add/subtract
;; two denormals, and adding/subtracting two denormals is always exact (no
;; shift occurs as the exponents are equal), it is impossible to generate
;; an underflow condition.  Thus, for add and subtract, this code will just
;; generate the correct result.  The result will always be exact.
;;
;; For multiply and divide, inexacts must be detected here.  An inexact here
;; may or may not also raise underflow in __fArithReturn.  It is possible
;; for a normal result to enter here.
;;
;; Register usage:
;;   r0 - underflowed number with leading bit set, round and guard bits
;;   r2 - shift count for a (0 - exp)
;;   r3 - sign in bit 0 (negative if set)
;;   r14 - exception information (INX_bit may be set here)
;;
;; Exits through __fArithReturn with the result in r0.
;;
__flt_underflow
          ; RDCFix: What part of r2 is valid?  Only low byte?
          ;         I don't completely understand this.
                                       ; Check for inexact
        TST     r2, #0x000000E0        ; Check for shift >= 32
        ORRNE   r14, r14, #INX_bit     ; If shift >= 32, lost all bits: inexact

fp_underflow_calc_result
        MOV     r3, r3, LSL #31        ; Position sign into sign bit position
        ORRS    r3, r3, r0, LSR r2     ; Combine sign, exponent, and mantissa
        BCS     fp_underflow_carry     ; last bit shifted out was set: round up
        RSB     r2, r2, #32            ; Check for inexact to see if we shifted
        MOVS    r0, r0, LSL r2         ;   any set bits out to the right
        ORRNE   r14, r14, #INX_bit     ; If we did, set inexact
        MOV     r0, r3
        B       __fArithReturn

fp_underflow_carry
        ORR     r14, r14, #INX_bit     ; RDCFix: Why is inexact guaranteed here?
        RSB     r2, r2, #33
        MOVS    r2, r0, LSL r2         ; C = last bit shifted out, EQ if the
                                       ;   remaining low bits are all zero
        ADC     r0, r3, #0             ; add the carry into the result
        BICEQ   r0, r0, #1             ; EQ: clear the LSB (round to even)
        B       __fArithReturn


; fsub_uncommon - subtraction with a special operand.
;   Entered with tmp = 255 << 24 and exp/shift already decoded.  Either a has
;   an all-ones exponent (inf/NaN) or b has a zero exponent (zero/denormal).
;   Denormals are decoded without the hidden bit and rejoin fsub_sub_core.
fsub_uncommon
        TEQ     tmp, exp, LSL #24       ; EQ if a is inf or NaN
        BEQ     fsub_inf_NaN
fsub_denorm     ; here b is denormalized or zero, a might be a normal number
        ; check whether a or b is zero - fast case
        MOVS    tmp, a, LSL #1
        MOVEQ   a, #0                   ; -0 - -0 = +0
        MOVS    b, b, LSL #1            ; EQ if b == 0 or a == 0
        BEQ     __fArithReturn          ; return a - 0 = a
        ; b is denormalized, a might be
        TST     exp, #255               ; a denormalized? (exp == 0 -> EQ)
        BIC     a, tmp, #0xFF000000     ; a = fraction << 1, exponent bits cleared
        ORRNE   a, a, #1 << 24          ; a is normal: add the leading one
        SUBNE   shift, shift, #1        ; correct shift
        ADDEQ   exp, exp, #1            ; both denorms - correct exp

        ; Check for inexact
        CMP     shift, #31              ; Shift amount >= 31?
        ORRGE   r14, r14, #INX_bit      ; Set inexact (note b != +/-0)
        BGE     fsub_sub_core
        RSB     tmp, shift, #31         ; Number of bits lost
        MOVS    tmp, b, LSL tmp         ; Check lower bits of lesser operand
        ORRNE   r14, r14, #INX_bit      ; If bits set, then inexact

        RSB     b, b, #0                ; Negate B
        B       fsub_sub_core


; fsub_inf_NaN - a has an all-ones exponent.
;   A NaN is passed on to __fArithNaNCheck.  If a is infinite and b is equal
;   to it (inf - inf), invalid operation is flagged and the result gets
;   fSignalBit set, which turns the infinity into a quiet NaN; otherwise the
;   infinity in r0 is returned unchanged.
fsub_inf_NaN
        ; handle infinities and NaNs - a is infinite or a NaN, b might be
        MOVS    tmp, a, LSL #9          ; a NaN? (NE)
        BNE     __fArithNaNCheck
        CMP     a, b                    ; a is infinite, b too? (EQ)
        ORREQ   r14, r14, #IVO_bit      ; Set invalid operation if is
        ORREQ   r0, r0, #fSignalBit     ; Make INF into a QNaN
        B       __fArithReturn          ; yes, a & b infinite -> generate IVO


;;
;; __fArithReturn
;;
;; Register Usage:
;;   r0 - Default return value
;;   r14 - Exception information
;;
;; Stack (offsets from SP, as defined by the EQUs at the top of this file):
;;   | Caller's Frame |
;;   |                |
;;   +----------------+
;;   | Return Address |  SP + LOC_SIZE (0x18)
;;   +----------------+
;;   | Original Arg2  |  SP + OrgOp2l  (0x14)
;;   +----------------+
;;   | Original Arg1  |  SP + OrgOp1l  (0x10), reused as NewResl
;;   +----------------+
;;   | Default result |  SP + ExDResl  (0x08), written only on an exception
;;   +----------------+
;;   | Copy of Arg2   |  SP + ExOp2l   (0x00), written only on an exception
;;   +----------------+  <-- SP
;;      Stack Top
;;
;;
;; Standard return path for single precision arithmetic routines.  It checks
;; if any exceptions occurred.  If any exceptional conditions occurred, then
;; an FPIEEE exception record is allocated and filled with the appropriate
;; values and the exception handler called.  Upon returning, the possibly
;; changed result is loaded from the returned union, the stack space is
;; restored, and control is returned to the caller.  If no exceptions occurred,
;; then the default result is returned.
;;
;; Before FPE_Raise is called, an inexact result whose exponent field is zero
;; additionally gets the underflow bit set.
;;
__fArithReturn
       TST     r14, #FPECause_mask      ; Any exceptions?
       ADDEQ   sp, sp, #LOC_SIZE        ; None so pop original args
  IF Interworking :LOR: Thumbing
       LDMEQFD sp!, {lr}                ;   and return
       BXEQ    lr
  ELSE
       LDMEQFD sp!, {pc}                ;   and return
  ENDIF
                                        ; Else we have an exception
                                        ; Check for underflow (denormal & inexact)
       MOV     tmp, #0xFF000000         ; Load up exponent mask << 1
       TST     r0, tmp, LSR #1          ; See if exponent is zero
       BNE     no_underflow             ; Non-zero exponent so no underflow possible
       TST     r14, #INX_bit            ; See if inexact bit is set
       ORRNE   r14, r14, #UNF_bit       ; If inexact, then underflow
no_underflow
       STR     r0, [sp, #ExDResl]       ; Push default result
       LDR     r0, [sp, #OrgOp2l]       ; Get orig Arg2 off stack
       STR     r0, [sp, #ExOp2l]        ; Push Arg2
       LDR     r2, [sp, #OrgOp1l]       ; Get orig Arg1 off stack
       MOV     r1, r14                  ; ExInfo
       ADD     r0, sp, #NewResl         ; Pointer to result from ex. handler
                                        ;   Note that this clobbers original
       CALL    FPE_Raise

    IF Thumbing :LAND: :LNOT: Interworking
        CODE16
        bx      pc              ; switch back to ARM mode
        nop
        CODE32
    ENDIF                                   ;   Arg1 and Arg2 on the stack

       LDR     r0, [sp, #NewResl]       ; Get returned result
       ADD     sp, sp, #LOC_SIZE        ; Pop orig. args and arg passing space
  IF Interworking :LOR: Thumbing
       LDMFD   sp!, {lr}                ; Return
       BX      lr
  ELSE
       LDMFD   sp!, {pc}                ; Return
  ENDIF


;; __fArithNaNCheck
;;
;; Checks both operands for SNaNs and raises an exception if one is present.
;; If no SNaNs are present, then a QNaN is returned.  At least one of Arg1
;; and Arg2 must be a NaN.
;;
;; A NaN with fSignalBit clear is treated as signalling: fSignalBit is set in
;; the returned value and IVO_bit is set in r14.  Arg1 is examined first; Arg2
;; is examined when Arg1 is not a NaN or is already quiet.
;;
;; Register usage:
;;   r0 - Arg1 (must be a NaN if Arg2 is not)
;;   r1 - Arg2 (must be a NaN if Arg1 is not)
;;   r14 - FP exception information
;;   a4 - scratch
;;   (fOP1/fOP2 are defined outside this file; presumably r0/r1 - confirm
;;    against fpe.asm)
;;
;; Exits through __fArithReturn.
;;
;; Code adapted from except.s.
__fArithNaNCheck
        MOV     a4, #0x01000000
        CMN     a4, fOP1, LSL #1        ; HI if (fOP1 << 1) > 0xFF000000: a NaN
        BLS     fcheck_opnd2_NaN
fcheck_opnd1_NaN
        TST     fOP1, #fSignalBit
        ORREQ   fOP1, fOP1, #fSignalBit ; signalling: quieten it
        ORREQ   r14, r14, #IVO_bit      ;   and flag invalid operation
        BEQ     __fArithReturn
        CMN     a4, fOP2, LSL #1        ; is operand 2 a NaN as well?
        BLS     __fArithReturn          ; no: return quiet operand 1
fcheck_opnd2_NaN
        MOV     fOP1, fOP2
        TST     fOP1, #fSignalBit
        ORREQ   fOP1, fOP1, #fSignalBit
        ORREQ   r14, r14, #IVO_bit
        B       __fArithReturn

    ENTRY_END __adds
        ]


;------------------------------------------------------------------------------

        [ :DEF: mul_s

	AREA   |.text|, CODE, READONLY

        Export  __muls
        Export  fmul_fdiv_overflow
        IMPORT  __fArithReturn
        IMPORT  __fArithNaNCheck
        IMPORT  __flt_normalise2
        IMPORT  __flt_underflow


        ; MULL48 $a, $b, $res, $tmp
        ;   Multiplies the two left-justified fractions $a and $b.
        ;   Out:  $res = high word of the product, with bit 0 forced to one
        ;                when the low word is non-zero (sticky bit)
        ;         $tmp = low word of the product
        ;   Side effects: subtracts 128 << 16 from the register named exp
        ;   (not a macro parameter) and changes the condition flags.
        MACRO
        MULL48  $a, $b, $res, $tmp
        ; a = AAAAAA00
        ; b = BBBBBB00

        UMULL   $tmp, $res, $a, $b
        SUB     exp, exp, #128 << 16    ; subtract bias+1 - 0..253 normal
        CMP     $tmp, #0
        ORRNE   $res, $res, #1

        MEND


    ; Prologues for __adds, __subs, __muls, and __divs must be the same
    ;
    ; __muls - single precision multiply entry point.
    ;   In:   r0 = first operand, r1 = second operand
    ;   Saves lr and both original operands in the local frame and loads the
    ;   operation code _FpMulS into r14 (no exception bits set yet).
    ;   Operands with a zero or all-ones exponent go to fmul_uncommon.
    ;   Results leave through __fArithReturn, fmul_fdiv_overflow or the
    ;   fmul_underflow code that follows this block.
    NESTED_ENTRY __muls
        EnterWithLR_16
        STMFD   sp!, {lr}               ; Save return address
        SUB     sp, sp, #LOC_SIZE       ; Allocate local storage
    PROLOG_END
        STR     r0, [sp, #OrgOp1l]      ; Save off args in case of exception
        MOV     r14, #_FpMulS           ; Initialize no exceptions, float multiply
        STR     r1, [sp, #OrgOp2l]

        MOV     mask, #255 << 16
        ANDS    expa, mask, a, LSR #7   ; expa = exponent of a << 16, EQ if zero
        ANDNES  expb, mask, b, LSR #7   ; expb = exponent of b << 16, EQ if zero
        TEQNE   expa, mask              ; EQ if a is inf/NaN
        TEQNE   expb, mask              ; EQ if b is inf/NaN
        BEQ     fmul_uncommon
        TEQ     a, b
        ORRMI   expa, expa, #1 << 8     ; result sign (signs differ) in bit 8
        MOV     mask, #1 << 31
        ORR     a, mask, a, LSL #8      ; a = 1.frac_a, left justified
        ORR     b, mask, b, LSL #8      ; b = 1.frac_b, left justified
fmul_mul
        ADD     exp, expa, expb
        MULL48  a, b, res, tmp

   ;; r1 now available for scratch

        CMP     res, #&80000000         ; CS if the leading bit is already bit 31
        MOVLO   res, res, LSL #1        ; otherwise normalise by one bit
        ADC     exp, exp, exp, ASR #16  ; recombine sign & exp, and adjust exp
        ; Inexact check.  The result fraction is res[31:8], so the bits that
        ; are discarded are res[7:0] (round bit and the bits below it) plus
        ; the low product word.  All eight low bits must be tested: testing
        ; only res[6:0] misses an exact tie, which is rounded below and is
        ; therefore inexact.
        ORRS    r1, tmp, res, LSL #24   ; NE if any discarded bit is set
        ORRNE   r14, r14, #INX_bit

fmul_round
        MOVS    a, res, LSR #8          ; never EQ (leading bit)
        ADC     a, a, exp, LSL #23      ; add fraction, and round
        TSTCS   res, #0x7f              ; EQ -> round to even
        CMPNE   exp, #252 << 16         ; possible overflow? (never EQ)
        BLO     __fArithReturn          ; return if no overflow and no round to even
        BICEQ   a, a, #1                ; delayed round to even
        CMP     exp, #252 << 16
        BLO     __fArithReturn
        BPL     fmul_fdiv_overflow

; fmul_underflow - reached by fall-through from fmul_round when neither the
; normal return nor the overflow branch was taken.
;   If the packed result in a has an exponent field of 1..3 it is returned as
;   is; otherwise the unrounded fraction, sign and denormalising shift are
;   handed to __flt_underflow.  Uses r1 as scratch.
fmul_underflow                          ; result may be normalised after rounding
        MOV     r1, a, LSL #1
        SUB     r1, r1, #1 << 24
        CMP     r1, #3 << 24           ; result exp in 1..3 -> return
        BLO     __fArithReturn
        MOV     a, res                  ; r0 = unrounded fraction
        MVN     sign, exp, LSR #8       ; correct sign from underflowed exponent
        RSB     exp, exp, #8            ; calc denormalising shift
        B       __flt_underflow

;;
;; fmul_fdiv_overflow is shared between __muls and __divs.
;;
;; In:   a = packed (rounded) result, exp = result exponent with sign,
;;       r14 = exception information
;; Returns the packed result unchanged when the range check below shows it
;; did not overflow; otherwise sets overflow and inexact and returns a
;; correctly signed infinity.  Exits through __fArithReturn.
;;
fmul_fdiv_overflow                      ; result might not be overflowed after all
	MOV	tmp, a, LSL #1
	ADD	tmp, tmp, #1 << 24
	CMP	tmp, #254 << 24		; Check for exp = 253 or 254
	BHS	__fArithReturn		; no overflow - 9 cycles overhead
	SUBS	a, a, exp, LSL #7       ; get correct sign
	ORR	r14, r14, #OVF_bit :OR: INX_bit	; Set overflow and inexact
        MOV     a, #0x7F000000          ; Create a properly signed INF
        ORR     a, a, #0x00800000       ;   ...
        ORRMI   a, a, #0x80000000       ;   ...
	B	__fArithReturn


; fmul_uncommon - multiply with a zero, denormal, infinite or NaN operand.
;   Entered from __muls with mask = 255 << 16 and expa decoded; expb is
;   recomputed here because the conditional decode in __muls may have been
;   skipped.  Inf/NaN operands go to fmul_inf_NaN, a zero operand returns a
;   signed zero, and denormals are normalised by __flt_normalise2, which is
;   given the address of fmul_mul in tmp.
fmul_uncommon                           ; a or b denorm/NaN/inf
        AND     expb, mask, b, LSR #7
        TEQ     a, b
        ORRMI   expa, expa, #1 << 8     ; result sign in bit 8
        CMP     expa, mask
        CMPLO   expb, mask
        BHS     fmul_inf_NaN
        ; a or b denorm, first check for zero case
        MOVS    tmp, a, LSL #1
        MOVNES  tmp, b, LSL #1
        MOVEQ   a, expa, LSL #23        ; return signed zero
        BEQ     __fArithReturn
        ; normalise operands
        ADR     tmp, fmul_mul
        B       __flt_normalise2


; fmul_inf_NaN - multiply where a or b has an all-ones exponent.
;   A NaN operand is passed on to __fArithNaNCheck.  Otherwise one operand is
;   infinite: a non-zero partner yields a signed infinity, a zero partner
;   flags invalid operation and yields a quiet NaN.
fmul_inf_NaN                            ; a or b is a NaN or infinite
        MOV     tmp, #0x01000000
        CMN     tmp, a, LSL #1
        CMNLS   tmp, b, LSL #1
        BHI     __fArithNaNCheck
        ; now a or b is infinite - check that a and b are non-zero
        MOVS    tmp, a, LSL #1          ; a zero?
        MOVNES  tmp, b, LSL #1          ; b zero?
        ORRNE   expa, expa, #255        ; create infinite
        MOV     a, expa, LSL #23        ;   with correct sign
                                        ; If NE: a & b nonzero, return infinite
        ORREQ   r14, r14, #IVO_bit      ; If EQ: inf * 0 signals an exception
        ORREQ   a, a, #0x7F000000       ; Create a QNaN
        ORREQ   a, a, #0x00C00000       ;   ...
        B       __fArithReturn

    ENTRY_END __muls
        ]

;---------------------------------------------------------------------------

        [ :DEF: div_s

	AREA   |.text|, CODE, READONLY

        Export  __divs
        IMPORT  __flt_normalise2
        IMPORT  __fArithReturn
        IMPORT  __flt_underflow
        IMPORT  fmul_fdiv_overflow
        IMPORT  __fArithNaNCheck

        ; TODO: * halve lookup table size

        ; Reciprocal estimate table for __divs (132 bytes).
        ; The label fdiv_tab marks the END of the table: __divs reads it with
        ; a negative, PC-relative byte offset derived from the denominator.
        ; The table must therefore stay immediately in front of fdiv_tab and
        ; in the same area as __divs.
        ; NOTE(review): the four leading zero bytes look like the entries for
        ; the largest denominators plus padding - confirm before resizing.

        DCB   0, 0, 0, 0
        DCB   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17
        DCB  18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 35, 36
        DCB  37, 39, 40, 41, 43, 44, 46, 47, 48, 50, 51, 53, 54, 56, 57, 59
        DCB  60, 62, 63, 65, 66, 68, 70, 71, 73, 74, 76, 78, 80, 81, 83, 85
        DCB  87, 88, 90, 92, 94, 96, 98,100,102,104,106,108,110,112,114,116
        DCB 118,120,122,125,127,129,131,134,136,138,141,143,146,148,151,153
        DCB 156,158,161,164,166,169,172,175,178,180,183,186,189,192,195,199
        DCB 202,205,208,212,215,218,222,225,229,233,236,240,244,248,252,255
fdiv_tab

    ; Prologues for __adds, __subs, __muls, and __divs must be the same
    ;
    ; __divs - single precision divide entry point.
    ;   In:   r0 = first operand, r1 = second operand
    ;   Saves lr and both original operands in the local frame.  Operands
    ;   with a zero or all-ones exponent go to fdiv_uncommon.  The quotient
    ;   is built from a table-based reciprocal estimate, one refinement step
    ;   and two long-division steps, then rounded.
    ;   r14 doubles as "guess" during the division, so the operation code
    ;   _FpDivS is loaded into it a second time before exception bits are set.
    ;   Results leave through __fArithReturn, fmul_fdiv_overflow or the
    ;   fdiv_underflow code that follows this block.
    NESTED_ENTRY __divs
        EnterWithLR_16
        STMFD   sp!, {lr}               ; Save return address
        SUB     sp, sp, #LOC_SIZE       ; Allocate local storage
    PROLOG_END
        STR     r0, [sp, #OrgOp1l]      ; Save off args in case of exception
        MOV     r14, #_FpDivS           ; Initialize no exceptions, float divide
                                        ;   Note that r14 is also used by "guess"
        STR     r1, [sp, #OrgOp2l]
        MOV     mask, #255 << 16
        ANDS    expa, mask, a, LSR #7   ; expa = exponent of a << 16, EQ if zero
        ANDNES  expb, mask, b, LSR #7   ; expb = exponent of b << 16, EQ if zero
        CMPNE   expa, #255 << 16        ; EQ if a is inf/NaN
        CMPNE   expb, #255 << 16        ; EQ if b is inf/NaN
        BEQ     fdiv_uncommon
        TEQ     a, b
        ADDMI   expa, expa, #1 << 8     ; result sign (signs differ) in bit 8
        ORR     tmp, a, #1 << 23        ; add leading one to the dividend
        ORR     den, b, #1 << 23        ; den (r0) = divisor with leading one
        BIC     num, tmp, #0xFF000000   ; num (r1) = dividend fraction
        BIC     den, den, #0xFF000000   ; den      = divisor fraction
fdiv_div
        ; calculate exponent and find leading bit of result
        SUB     exp, expa, expb
        CMP     num, den
        ; this code fills result delay slots
        ;MOVLO   num, num, LSL #1         ; shift so that div >= 1 << 23
        ;ADD     exp, exp, #(127-2) << 16 ; subtract bias (one too small)
        ;ADC     exp, exp, exp, ASR #16   ; calc exp, combine with sign

        ; lookup guess of 1/den - use rounded inverted tablelookup
        ADD     tmp, den, #32768 + ((. + 12 - (fdiv_tab + 127)) << 16)
        LDRB    guess, [pc, -tmp, LSR #16]
        RSB     den, den, #0            ; result delay - negate den for MLA
        ADD     guess, guess, #256

        ; do one Newton-Raphson iteration to increase precision to 15 bits
        MUL     tmp, den, guess
        MOVLO   num, num, LSL #1        ; result delay - shift so that div >= 1 << 23
        MOV     tmp, tmp, ASR #4
        MUL     div, tmp, guess
        MOV     guess, guess, LSL #7
        ADD     guess, guess, div, ASR #21

        ; long division - 13 bits
        MOV     tmp, num, LSR #10
        MUL     tmp, guess, tmp
        MOV     num, num, LSL #12
        MOV     div, tmp, LSR #17
        MLA     num, den, div, num
        ADD     exp, exp, #(127-2) << 16 ; result delay - subtract bias (one too small)

        ; long division - 11 bits (can do 12)
        MOV     tmp, num, LSR #10
        MUL     tmp, guess, tmp
        MOV     num, num, LSL #11
        MOV     tmp, tmp, LSR #18
        MLA     num, den, tmp, num
        ADC     exp, exp, exp, ASR #16  ; result delay - calc exp, combine with sign

        ; correct div (may be one too small)
        CMN     num, den
        ADDHS   num, num, den           ; now num < den
        ADC     div, tmp, div, LSL #11

        MOV     r14, #_FpDivS           ; Reinitialize no exceptions, float divide
                                        ;   Note that r14 was used for "guess"
        CMP     num, #0                 ; Check for inexact
        ORRNE   r14, r14, #INX_bit      ; Set inexact if bits lost
        CMN     den, num, LSL #1        ; CS -> round, EQ -> round to even
        ADC     a, div, exp, LSL #23    ; recombine exp and fraction - increment exp
        CMPNE   exp, #252 << 16         ; exp < 252 cannot overflow
        BLO     __fArithReturn
        BICEQ   a, a, #1
        CMP     exp, #252 << 16         ; exp < 252 cannot overflow
        BLO     __fArithReturn
        BPL     fmul_fdiv_overflow

fdiv_underflow                          ; result may be normalised after rounding
        MOV     tmp, a, LSL #1
        SUB     tmp, tmp, #1 << 24
        CMP     tmp, #3 << 24           ; result exp in 1..3 -> return
        BLO     __fArithReturn
        CMP     num, #1                 ; num contains implicit guard bits
        ADC     a, div, div             ; add explicit guard bit (1 if num > 0)
        MVN     sign, exp, LSR #8       ; get correct sign
        RSB     exp, exp, #1            ; calc 1 - exp
        B       __flt_underflow


fdiv_uncommon
        AND     expb, mask, b, LSR #7
        TEQ     a, b
        ORRMI   expa, expa, #1 << 8
        CMP     expa, mask
        CMPLO   expb, mask
        BHS     fdiv_inf_NaN
        ; a or b denorm, first check for zero case
        MOVS    tmp, b, LSL #1
        BEQ     fdiv_divbyzero          ; a / 0 -> division by zero
        MOVS    tmp, a, LSL #1          ; 0 / b -> 0
        MOVEQ   a, expa, LSL #23        ; return signed zero
        BEQ     __fArithReturn
        ; normalise operands
        ADR     tmp, fdiv_div1
        B       __flt_normalise2

fdiv_div1                               ; remove... quick hack
        MOV     tmp, a, LSR #8
        MOV     den, b, LSR #8
        MOV     num, tmp
        B       fdiv_div


fdiv_inf_NaN                            ; a or b is a NaN or infinite
        MOV     tmp, #0x01000000
        CMN     tmp, a, LSL #1
        CMNLS   tmp, b, LSL #1
        BHI     __fArithNaNCheck
        ; now a or b is infinite - check that a and b are not both infinite
        CMN     tmp, a, LSL #1
        CMNEQ   tmp, b, LSL #1
        MOVEQ   a, expa, LSL #23
        ORREQ   r14, r14, #IVO_bit      ; Set invalid
        ORREQ   a, a, #0x7F000000       ; Create QNaN
        ORREQ   a, a, #0x00C00000       ;   ...
        BEQ     __fArithReturn          ; inf / inf -> IVO
        CMN     tmp, b, LSL #1          ; b inf? (EQ)
        MOVEQ   a, #0                   ; a / inf -> signed zero
        BICNE   a, a, #1 << 31          ; inf / b = inf (even inf / 0 = inf)
        ORR     a, a, expa, LSL #23     ; set sign
        B       __fArithReturn

fdiv_divbyzero                          ; b zero
        MOVS    tmp, a, LSL #1
        ORREQ   r14, r14, #IVO_bit      ; 0 / 0 -> IVO
        ORREQ   a, a, #0x7F000000       ; Create QNaN
        ORREQ   a, a, #0x00C00000       ;   ...
        ORRNE   r14, r14, #DVZ_bit      ; A / 0 -> DVZ
        MOVNE   a, expa, LSL #23        ; set sign of result (returns signed inf)
        ORRNE   a, a, #0x7F000000       ; Create properly signed INF
        ORRNE   a, a, #0x00800000       ;   ...
        B       __fArithReturn

    ENTRY_END __divs
        ]

;---------------------------------------------------------------------------

        [ :DEF: fnorm2_s

	AREA   |.text|, CODE, READONLY

        EXPORT  __flt_normalise2

        ; __flt_normalise2
        ;
        ; normalise a or b (or both). One operand is denormalised
        ; a = x0AAAAAA, bits 0-22 nonzero, bits 23-30 zero
        ; normalise such that bit 23 = 1
        ; return to address in tmp
        ;
        ; On entry:
        ;   a, b        packed operands
        ;   expa, expb  exponent field in bits 16-23 (zero field = denormal);
        ;               bits below 16 are not modified by this routine
        ;   tmp         return address (the routine exits with MOV pc, tmp)
        ; On exit:
        ;   a, b        shifted left by 8, then a denormal operand is shifted
        ;               further until its leading one reaches bit 31; a normal
        ;               operand just gets bit 31 forced to 1. A caller such as
        ;               fdiv_div1 shifts right by 8 again, which puts the
        ;               leading one at bit 23.
        ;   expa, expb  for a normalised operand: exponent field reduced by
        ;               (extra shift count - 1)
        ;   flags       clobbered (CMP/TST/MOVS)
        ; No stack is used and r14 is not touched.

        ; When assembling for Thumb, switch to 32-bit ARM code for this routine.
        [ :DEF: thumb
        CODE32
        ]
__flt_normalise2
        MOV     a, a, LSL #8            ; drop sign/exponent bits 24-31; old bit 23 -> bit 31
        MOV     b, b, LSL #8            ; same for b
        TST     expa, #255 << 16        ; is a's exponent field zero (a denormal)?
        BNE     fnorm_b                 ; no -> a is normal, so only b needs normalising
fnorm_a
        ; Binary search for the leading one of a: shift by 16, 8, 4, 2 while
        ; the corresponding top bits are all zero, reducing expa by the same
        ; amount each time.
        CMP     a, #1 << 16             ; LO -> top 16 bits are all zero
        SUBLO   expa, expa, #16 << 16
        MOVLO   a, a, LSL #16
        TST     a, #255 << 24           ; EQ -> top 8 bits are all zero
        SUBEQ   expa, expa, #8 << 16
        MOVEQ   a, a, LSL #8
        TST     a, #15 << 28            ; EQ -> top 4 bits are all zero
        SUBEQ   expa, expa, #4 << 16
        MOVEQ   a, a, LSL #4
        TST     a, #3 << 30             ; EQ -> top 2 bits zero; otherwise N = bit 31 of a
        SUBEQ   expa, expa, #2 << 16
        MOVEQS  a, a, LSL #2            ; shift by 2 and refresh N from the new bit 31
        MOVPL   a, a, LSL #1            ; bit 31 still clear -> one final shift
        ADDMI   expa, expa, #1 << 16    ; bit 31 was already set -> no shift, exponent + 1
                                        ;   (flags unchanged by MOVPL, so exactly one of
                                        ;   MOVPL / ADDMI executes)

        TST     expb, #255 << 16        ; is b a normal number?
        ORRNE   b, b, #1 << 31          ; yes -> set b's bit 31 (leading one)
        MOVNE   pc, tmp                 ;   and return; otherwise fall through, b is denormal too
fnorm_b
        ; Reached either by the BNE above (a is normal: bit 31 must be forced
        ; to 1) or by fall-through from fnorm_a (a already has bit 31 set as
        ; long as its fraction was nonzero, so the ORR changes nothing).
        ; b is normalised unconditionally here; per the contract above it is
        ; expected to be the denormal operand on this path.
        ORR     a, a, #1 << 31
        CMP     b, #1 << 16             ; LO -> top 16 bits are all zero
        SUBLO   expb, expb, #16 << 16
        MOVLO   b, b, LSL #16
        TST     b, #255 << 24           ; EQ -> top 8 bits are all zero
        SUBEQ   expb, expb, #8 << 16
        MOVEQ   b, b, LSL #8
        TST     b, #15 << 28            ; EQ -> top 4 bits are all zero
        SUBEQ   expb, expb, #4 << 16
        MOVEQ   b, b, LSL #4
        TST     b, #3 << 30             ; EQ -> top 2 bits zero; otherwise N = bit 31 of b
        SUBEQ   expb, expb, #2 << 16
        MOVEQS  b, b, LSL #2            ; shift by 2 and refresh N from the new bit 31
        MOVPL   b, b, LSL #1            ; bit 31 still clear -> one final shift
        ADDMI   expb, expb, #1 << 16    ; bit 31 was already set -> no shift, exponent + 1
        MOV     pc, tmp                 ; return to the address supplied by the caller


        ]

;===========================================================================

        END