;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Microsoft Research Singularity
;;; 
;;; Copyright (c) Microsoft Corporation.  All rights reserved.
;;;
;;; This file contains ARM-specific assembly code.
;;;

	; Declare the global logical assembly variable mul_s and then include the
	; shared veneer source.
	; NOTE(review): veneer_d.asm is not visible from this file; presumably it
	; keys off mul_s to emit the double-precision multiply veneer -- confirm.
	GBLL mul_s

	GET veneer_d.asm

	; END marks the end of the source for the assembler, so the reference
	; implementation of __muld kept below this line is not assembled.
	END


;;;;  THE BELOW ROUTINE SHOULD WORK, BUT THE ARM ROUTINES SHOULD BE FASTER.


;
; Translated to ARM from SH3 FP emulation routines.
;
; __muld  Double precision floating point multiplication.
; Input:
;   r0 - Arg1.low
;   r1 - Arg1.high
;   r2 - Arg2.low
;   r3 - Arg2.high
; Output:
;   r0 - Result.low
;   r1 - Result.high
;
; Note:
;   If any FP exceptions are enabled, this routine may raise an exception.
;
;
; IEEE DOUBLE FORMAT
;
; 8 BYTES (LONG WORD * 2)
; 63 62       52 51                                                 0
; +-+-----------+----------------------------------------------------+
; |s|   e(11)   |                         m(52)                      |
; +-+-----------+----------------------------------------------------+
;               ^ point
;
; INFINITY NUMBER : e =  2047          m = 0
; ZERO            : e =     0          m = 0
; NaN             : e =  2047          m != 0
; DENORMAL NUMBER : e =     0          m != 0
;
    GET      fpe.asm

    Export   __muld

    IMPORT   FPE_Raise

    AREA |.text|, CODE, READONLY


; Bit masks applied to the high mantissa word (r8 / r2) in __muld.
CARRY_CHECK EQU 0x01000000  ; Bit 24: one place above the hidden bit once the mantissa carries its 3 GRS bits
MSB         EQU 0x00100000  ; Bit 20: hidden (1.0) bit position in the high word of an unpacked mantissa


; Entry point.  The STMFD below pushes r0-r3 (the caller's operand words) as
; well as r4-r9 and lr, so the original operands remain on the stack for the
; exception-reporting path at cause_trap.
__muld

    STMFD   sp!, {r0-r9, lr} ; Save off args and non-volatiles and lr

    MOV     r8, r1          ; Load parameter1 as R8 R0
    MOV     r4, r2          ; Load parameter2 as R2 R4
    MOV     r2, r3          ;   ...
    MOV     r5, #_FpMulD    ; Double multiply, assume no exceptions


; Unpack parameters.
;
; R8 R0:   mantissa1            R2 R4:   mantissa2
; R9:      exponent1            R1:      exponent2
; R7:      sign = sign1 XOR sign2
;
; R5:      Exception flags

    MOV     r9, r8, LSL #1     ; Extract exponent1: shift the sign bit out,
    MOV     r9, r9, LSR #21    ;   then right-justify the 11 exponent bits
    MOV     r1, r2, LSL #1     ; Extract exponent2 the same way
    MOV     r1, r1, LSR #21    ;   ...
    MVN     r3, #0             ; Set up to extract mantissas (r3 = 0xFFFFFFFF)
    EOR     r7, r8, r2         ; Compute sign of result (only bit 31 is used later)
    AND     r8, r8, r3, LSR #12; Extract mantissa1 (keep low 20 bits of the high word)
    AND     r2, r2, r3, LSR #12; Extract mantissa2 (keep low 20 bits of the high word)


; Check for exceptional cases.  All NaNs, infinities, and 0's are eliminated.
; Denormal numbers return here after normalizing them.  After these checks,
; both parameters are normalized numbers.
;
; if (exponent1 == 2047)
;     exception1; parameter1 is nonfinite
; if (exponent2 == 2047)
;     exception2; parameter1 is finite, parameter2 is nonfinite
; if (exponent1 ==    0)
;     exception3; parameter1 is 0 or denormal, parameter2 is finite
; if (exponent2 ==    0)
;     exception4; parameter1 is normalized, parameter2 is 0 or denormal

    ; Each "== 2047" test is performed as (exponent + 1) == 2048.
    ; r3 is only a scratch register here.
    ADD     r3, r9, #1      ; if (exponent1==2047)
    CMP     r3, #2048       ;   ..
    BEQ     exception1      ;   exception1

    ADD     r3, r1, #1      ; if (exponent2==2047)
    CMP     r3, #2048       ;   ..
    BEQ     exception2      ;   exception2

    CMP     r9, #0          ; if (exponent1==0)
    BEQ     exception3      ;   exception3
exception_return3           ; exception3 branches back here after normalizing mantissa1

    CMP     r1, #0          ; if (exponent2==0)
    BEQ     exception4      ;   exception4
exception_return4           ; exception4 branches back here after normalizing mantissa2

; Multiply the 53-bit mantissa1 and mantissa2 to produce a 106-bit product.
;
; Mantissas:
;
;  63       53   51                 32 31                                0
;  31       21   19                  0 31                                0
; +-----------+-+---------------------+-----------------------------------+
; |<--- 0 --->|1|      m1h, m2h       |             m1l, m2l              |
; +-----------+-+---------------------+-----------------------------------+
;               ^ Binary point
;
; Partial product terms:
;
;                                     m1l*m2l.h       m1l*m2l.l
;                     m1h*m2l.h < C + m1h*m2l.l
;                   + m1l*m2h.h < C + m1l*m2h.l
;   + m1h*m2h.h < C + m1h*m2h.l
;   -----------     -----------     -----------     -----------
;          res3            res2            res1            res0
;
; Intermediate result:
;
;  127                106 104 103   96 95       64 63       32 31        0
;  31                  10 9 8 7      0 31        0 31        0 31        0
; +----------------------+-+-+--------+----/\/----+----/\/----+----/\/----+
; |<-------- 0 --------->|?|?|R3: res3|  R8: res2 |  R0: res1 |  R6: res0 |
; +----------------------+-+-+--------+----/\/----+----/\/----+----/\/----+
;                            ^ Binary point

    ; Four 32x32->64 partial products are accumulated into r3:r8:r0:r6.
    ; None of the UMULLs or the exponent adjustments below use the S suffix,
    ; so the carry left by each ADDS/ADCS is still intact when the following
    ; ADCS/ADC consumes it.  Do not reorder without re-checking the carry chain.
    ADD     r9, r9, r1      ; Compute exponent of result ...

    UMULL   r6, r1, r0, r4  ; Compute m1l * m2l
                            ;   r6 = m1l*m2l.l, res0
                            ;   r1 = m1l*m2l.h

    ORR     r8, r8, #MSB    ; Set mantissa1's hidden bit
    ORR     r2, r2, #MSB    ; Set mantissa2's hidden bit

    UMULL   r4, r3, r8,r4   ; Compute m1h * m2l
                            ;   r4 = m1h*m2l.l
                            ;   r3 = m1h*m2l.h

    ADDS    r4, r1, r4      ; Add 1st 2 terms of res1 (carry out feeds res2 below)

    SUB     r9, r9, #0x400  ;   ... compute exponent of result
    ADD     r9, r9, #0x1    ;   ... net effect: exponent1 + exponent2 - 1023

    UMULL   r0, r1, r2, r0  ; Compute m1l * m2h
                            ;   r0 = m1l*m2h.l
                            ;   r1 = m1l*m2h.h

    ADCS    r1, r1, r3      ; Add 1st 2 terms of res2 plus res1's first carry; no carry out
                            ;   (both high halves come from a 21-bit x 32-bit product)
    ADCS    r0, r0, r4      ; Add 3rd term of res1, no carry in (C was just cleared)

    UMULL   r8, r3, r2, r8  ; Compute m1h * m2h
                            ;   r8 = m1h*m2h.l
                            ;   r3 = m1h*m2h.h

    ADCS    r8, r8, r1      ; Add 3rd term of res2 plus res1's second carry
    ADC     r3, r3, #0      ; Add res2's carry to res3


; Shift the intermediate result right 17 bits, and 1 more if the product took
; 2 bits to the left of the binary point.  Fold all dropped bits from the right
; into the sticky bit S.  This leaves the result in standardized form for
; rounding.
;
; Result:
;  63     56   54                   32 31                            3 2 0
;  31     24   22                    0 31                            3 2 0
; +---------+-+-----------------------+-----------------------------------+
; |<-- 0 -->|1|         R8            |               R0             L|GRS|
; +---------+-+-----------------------+-----------------------------------+
;             ^ Binary point

; On entry r3:r8:r0:r6 hold res3:res2:res1:res0.  On exit r8:r0 hold the
; mantissa with the leading 1 at bit 23 of r8, r6 holds the bits shifted out
; below r0 (folded into the sticky bit afterwards), and r9 has been bumped
; if the product needed the extra 1-bit shift.
normalize
    CMP     r6, #0              ; Fold bits we're about to lose into a
    ORRNE   r0, r0, #1          ;   sticky bit
    MOV     r6, r6, LSR #17     ; Shift intermediate result right 17
    ORR     r6, r6, r0, LSL #15 ;   .. low 17 bits of r0 move into r6
    MOV     r0, r0, LSR #17     ;   ..
    ORR     r0, r0, r8, LSL #15 ;   .. low 17 bits of r8 move into r0
    MOV     r8, r8, LSR #17     ;   ..
    ORR     r8, r8, r3, LSL #15 ;   .. res3 supplies the top bits of r8
    TST     r8, #CARRY_CHECK    ; If product has 2 bits to the left of the
    BEQ     end_normalize       ;   binary point
    MOVS    r8, r8, LSR #1      ; Then normalize by scaling right 1
    MOVS    r0, r0, RRX         ;   more bit (C carries the bit between words)
    MOV     r6, r6, RRX         ;   ..
    ADD     r9, r9, #1          ;   .. and compensate in the exponent
end_normalize

; There are still 17 or 18 guard bits on the left of R6 that need to be folded
; into the sticky bit S.  It's safe to check the right ones over again because
; we're only concerned with stickiness.

    ; r6 holds only bits below the sticky position at this point; any non-zero
    ; value means the result is inexact below S.
    CMP     r6,#0              ; If any guard bits below S are set
    ORRNE   r0, r0, #1         ;   fold them into S

; Denormalize the result if necessary, with no concern for performance.
; The loop shifts r8:r0 right one bit per iteration and leaves r9 == 0.

    CMP     r9, #0             ; If exponent <= 0
    BGT     end_denormal       ;   ..
    RSB     r9, r9, #0         ; Then shift right (1 - exponent) places:
    ADD     r9, r9, #1         ;   -exponent, +1 for the non-hidden bit
denormal_loop
    MOVS    r8, r8, LSR #1     ;   .. C = bit leaving r8
    MOVS    r0, r0, RRX        ;   .. C = bit leaving r0 (the old sticky bit)
    ORRCS   r0, r0, #1         ;   Fold the lost bit into the sticky bit
    SUBS    r9, r9, #1         ;   .. count down; exits with r9 == 0
    BNE     denormal_loop      ;   ..
end_denormal

; Round to nearest.  If rounding occurs, set inexact and
; mantissa += G & ( L | R | S ).  If the rounding carries, then renormalize.

; Test for inexact.
    ; Low bits of r0: bit 3 = L (mantissa LSB), bit 2 = G, bit 1 = R, bit 0 = S.
    TST     r0, #0x7           ; If G|R|S (=> rounding required)
    BEQ     end_round          ;   ..
    ORR     r5, r5, #INX_bit   ;   result is inexact

; Round to nearest.
    TST     r0, #0x4           ; If G &&
    BEQ     end_round          ;   ..
    TST     r0, #0xB           ;   L|R|S  (0xB = bits 3, 1 and 0)
    BEQ     end_round          ;     ..
    ADDS    r0, r0, #0x8       ; Then round the mantissa up (add 1 at the L position)
    ADC     r8, r8, #0         ;   .. propagate the carry into the high word

    CMP     r8, #CARRY_CHECK   ;   If the rounding carried
    BLT     no_normal_carry    ;     (mantissa >= 0x01000000)
    
    MOVS    r8, r8, LSR #1     ;   Then renormalize
    MOV     r0, r0, RRX        ;     ..
    ADD     r9, r9, #1         ;     ..
    B       end_round          ;     ..

no_normal_carry
    CMP     r9, #0             ;   Else if (exponent == 0)
    BNE     end_round          ;     ..
    CMP     r8, #CARRY_CHECK>>1;     && (mantissa >= 0x00800000)
    MOVGE   r9, #1             ;   Then rounded MaxDenorm to MinNormal

end_round

; Test for overflow.  Do this after rounding in case rounding caused overflow.
; As in the operand checks, "exponent >= 2047" is tested as
; (exponent + 1) >= 2048; r3 is scratch.
    ADD     r3, r9, #1         ; If (exponent >= 2047)
    CMP     r3, #2048          ;   ..
    BGE     return_overflow    ;   return overflow exception

; Test tininess after rounding.  Underflow is only flagged when the result is
; both inexact and still has a zero exponent (zero or denormal).
    TST     r5, #INX_bit        ; If already inexact
    BEQ     end_check_underflow1;  ..
    CMP     r9, #0              ;   and if exponent = 0
    ORREQ   r5, r5, #UNF_bit    ;     result has underflowed too
end_check_underflow1

; Pack the result back into IEEE format.

; Pack r8:r0 (mantissa with 3 trailing GRS bits), r9 (biased exponent) and the
; sign in bit 31 of r7 into the IEEE double r1:r0.  r3 is used as scratch.
return_value

    MOV     r0, r0, LSR #3      ; Drop G, R and S from the low word
    ORR     r0, r0, r8, LSL #29 ;   and bring in the 3 bits that cross from r8
    MOV     r8, r8, LSL #9      ; Keep only bits 3..22 of the high word: this
    MOV     r8, r8, LSR #12     ;   strips the hidden bit and the 3 bits now in r0
    ORR     r1, r8, r9, LSL #20 ; High word = exponent : mantissa.high
    AND     r3, r7, #0x80000000 ; Isolate the result sign
    ORR     r1, r1, r3          ;   and merge it into the high word

; If any trap enable flags are set corresponding to exception flags set,
; set the corresponding cause bits and cause a trap.
;
; if (exception)
;     call handler
;     extract the possibly updated result
; return

; Common exit.  On entry r1:r0 hold the default result and r5 the exception
; information.  If r5 has any FPECause_mask bits set, FPE_Raise is called with
; the original operands and the default result, and the value it writes back
; is returned instead.  Finally the saved argument words are discarded and the
; non-volatile registers are restored.
return

    TST     r5, #FPECause_mask ; If any exceptions occurred ...
    BEQ     done               ;   ..

cause_trap
;;
;;  Register usage:
;;      r0 - Default result.low
;;      r1 - Default result.high
;;      r5 - Exception information
;;
;;  Stack:
;;      0x10(sp) - up: Saved registers
;;      0xC(sp): Original Arg2.high
;;      0x8(sp): Original Arg2.low
;;      0x4(sp): Original Arg1.high
;;      0x0(sp): Original Arg1.low
;;
        LDR     r2, [sp, #0x8]           ; Load original Arg2.low
        LDR     r3, [sp, #0xC]           ; Load original Arg2.high
        SUB     sp, sp, #0x8             ; Make room for exception information
        STR     r2, [sp, #0x0]           ; Store original Arg2.low
        STR     r3, [sp, #0x4]           ; Store original Arg2.high
        ; Arg1 must arrive as r2 = low word, r3 = high word, matching the
        ; register usage documented below and the low-first layout used for
        ; Arg2 on the stack.  (The loads previously targeted the opposite
        ; registers, handing FPE_Raise a word-swapped operand.)
        LDR     r2, [sp, #0x8]           ; Load original Arg1.low
        LDR     r3, [sp, #0xC]           ; Load original Arg1.high
        STR     r0, [sp, #0x8]           ; Store default result.low
        STR     r1, [sp, #0xC]           ; Store default result.high
        MOV     r1, r5                   ; Move exception information
        ADD     r0, sp, #0x10            ; Pointer for return value

;;  Register Usage:
;;      r0 - Address for return value = 0x10(sp)
;;      r1 - Exception information
;;      r2 - Original arg1.low
;;      r3 - Original arg1.high
;;
;;  Stack Usage:
;;      0x14(sp): Return result.high
;;      0x10(sp): Return result.low
;;      0xC(sp): Default result.high
;;      0x8(sp): Default result.low
;;      0x4(sp): Original arg2.high
;;      0x0(sp): Original arg2.low
        CALL    FPE_Raise             ; Deal with exception information

    IF Thumbing :LAND: :LNOT: Interworking
        CODE16
        bx      pc              ; switch back to ARM mode
        nop
        CODE32
    ENDIF

        LDR     r0, [sp, #0x10]       ; Load up returned result
        LDR     r1, [sp, #0x14]       ;  ...
        ADD     sp, sp, #0x8          ; Restore extra arg passing space

done
    ADD     sp, sp, #0x10             ; Pop off original args
                                      ; Restore non-volatiles and return
  IF Interworking :LOR: Thumbing
    LDMIA   sp!, {r4-r9, lr}
    BX      lr
  ELSE
    LDMIA   sp!, {r4-r9, pc}
  ENDIF


;%%%%%%%%%%%%%%%%%%%%%%%%%
;%  Exceptional process  %
;%%%%%%%%%%%%%%%%%%%%%%%%%

; Exception 1: parameter1 is non-finite (exponent1 == 2047); it's either a
; NaN or inf.  The mantissa has not been shifted left for the guard bits yet.
;
; If either parameter is an SNaN, return an invalid op exception with a QNaN.
; Otherwise, if either parameter is a QNaN, silently return a QNaN.  Otherwise,
; parameter1 is inf.  Return an invalid op exception with a QNaN for inf*0, or
; inf for inf*inf or inf*<non-0 finite>.
;
; if (mantissa1<51> == 0 &           // parameter1 is an SNaN
;     mantissa1 != 0)                //  ..
;     return invalid op exception
; else if (exponent2 == 2047 &       // parameter2 is an SNaN
;          mantissa2<51> == 0 &      //   ..
;          mantissa2 != 0)           //   ..
;     return invalid op exception
; else if (mantissa1 != 0)           // parameter1 is a QNaN
;     return QNaN
; else if (exponent2 != 2047)        // parameter2 is finite
;     if (parameter2 != 0)           // inf*<non-0 finite>
;         return inf
;     else                           // inf*0
;         return invalid op exception
; else if (mantissa2 != 0)           // parameter2 is a QNaN
;     return QNaN
; else                               // inf*inf
;     return inf

; parameter1 has exponent 2047.  r8:r0 = mantissa1, r2:r4 = mantissa2,
; r1 = exponent2.  Whatever ends up in r8:r0 is the payload of any QNaN
; that is returned.  r3 is scratch throughout.
exception1
    ORRS    r3, r8, r0         ; mantissa1 == 0 => parameter1 is inf,
    BEQ     e1_p2_snan_check   ;   so it cannot be an SNaN
    TST     r8, #dSignalBit    ; NaN with mantissa1[MSb] == 0 is an SNaN
    BEQ     return_invalid     ;   => invalid operation

e1_p2_snan_check
    ADD     r3, r1, #1         ; parameter2 is an SNaN only if
    CMP     r3, #2048          ;   exponent2 == 2047
    BNE     e1_p2_not_snan     ;   ..
    ORRS    r3, r2, r4         ;   and mantissa2 != 0
    BEQ     e1_p2_not_snan     ;   ..
    TST     r2, #dSignalBit    ;   and mantissa2[MSb] == 0
    BNE     e1_p2_not_snan     ;   ..
    MOV     r8, r2             ; SNaN: the result takes parameter2's mantissa
    MOV     r0, r4             ;   ..
    B       return_invalid     ;   => invalid operation

e1_p2_not_snan
    ORRS    r3, r8, r0         ; No SNaN involved; a non-zero mantissa1
    BNE     return_QNaN        ;   means parameter1 is a QNaN

e1_p1_is_INF
    ADD     r3, r1, #1         ; parameter1 is inf; is parameter2 finite?
    CMP     r3, #2048          ;   ..
    BEQ     e1_p2_INF_NaN      ;   no, exponent2 == 2047
    CMP     r1, #0             ; Finite and non-zero if exponent2 != 0
    BNE     return_inf         ;   => inf
    ORRS    r3, r2, r4         ;   or if mantissa2 != 0 (denormal)
    BNE     return_inf         ;   => inf
    MOV     r8, #0             ; inf * 0: clear mantissa1 so the QNaN
    MOV     r0, #0             ;   carries an empty payload
    B       return_invalid     ;   => invalid operation

e1_p2_INF_NaN
    ORRS    r3, r2, r4         ; mantissa2 == 0 => inf * inf
    BEQ     return_inf         ;   => inf
    MOV     r8, r2             ; Otherwise parameter2 is a QNaN:
    MOV     r0, r4             ;   return it via mantissa1
    B       return_QNaN        ;   ..
                               ;   return INF


; Exception 2: parameter1 is finite.  parameter2 is non-finite (exponent2 ==
; 2047); it's either a NaN or inf.  The mantissa has not been shifted left
; for the guard bits yet.
;
; If parameter2 is an SNaN, return an invalid op exception with a QNaN.
; Otherwise, if it's a QNaN, silently return a QNaN.  Otherwise it's finite*inf
; so return an invalid op exception with a QNaN for 0*inf, or inf for
; <non-0 finite>*inf.
;
; In both NaN cases the QNaN is built from parameter2's mantissa (copied into
; R8 R0 first), so the NaN's payload is propagated rather than the finite
; operand's mantissa bits.  This matches what exception1 does when parameter2
; is the SNaN.
;
; if (mantissa2 != 0 &
;     mantissa2<51> == 0)       // parameter2 is an SNaN
;     return invalid op exception
; else if (mantissa2 != 0)      // parameter2 is a QNaN
;     return QNaN
; else if (parameter1 != 0)     // parameter1 is non-0 finite
;     return inf
; else                          // it's 0*inf
;     return invalid op exception

exception2
    ORRS    r3, r2, r4      ; if (mantissa2 == 0)
    BEQ     e2_p2_is_inf    ;   parameter2 is inf
    MOV     r8, r2          ; parameter2 is a NaN: copy mantissa2 into
    MOV     r0, r4          ;   mantissa1 for the QNaN (MOV leaves flags alone)
    TST     r2, #dSignalBit ; if (mantissa2[MSb] == 0) it is an SNaN
    BEQ     return_invalid  ;   return invalid operation
    B       return_QNaN     ; else return QNaN

e2_p2_is_inf
    ORRS    r3, r8, r0      ; if (mantissa1 == 0 &&
    CMPEQ   r9, #0          ;     exponent1 == 0), it's 0*inf
    MOVEQ   r8, r2          ;   copy mantissa2 (zero) to mantissa1 for QNaN
    MOVEQ   r0, r4          ;   ..
    BEQ     return_invalid  ;   return invalid operation
    B       return_inf      ; else return INF



; Exception 3: parameter1 is 0 or denormal (exponent1 = 0), parameter2 is
; finite.
;
; if (mantissa1 == 0)
;     return zero
; else normalize parameter1

; parameter1 has exponent 0.  A zero mantissa means the operand is zero and
; the product is a signed zero.  Otherwise shift mantissa1 (r8:r0) left until
; the 1.0 bit (MSB) is set.  The first shift is free, because denormals have
; no hidden bit; every further shift lowers exponent1 (r9) by one.
exception3
    ORRS    r3, r8, r0      ; mantissa1 == 0 ?
    BEQ     return_zero     ;   yes: return zero
p1_norm
    ADD     r9, r9, #1      ; Pre-credit the free first shift
p1_norm_loop
    ADDS    r0, r0, r0      ; Shift the low word left; C = bit shifted out
    ADC     r8, r8, r8      ; Shift the high word left, taking C into bit 0
    SUB     r9, r9, #1      ; One place up in mantissa, one down in exponent
    CMP     r8, #MSB        ; Repeat while mantissa1 < 1.0
    BLT     p1_norm_loop    ;   ..
end_p1_norm
    B       exception_return3
                            ; Resume the operand checks
                            ; Done


; Exception 4: parameter1 is finite and (now) normalized, parameter2 is 0 or
; denormal (exponent2 = 0).
;
; if (mantissa2 == 0)
;     return zero
; else normalize parameter2
; parameter2 has exponent 0.  A zero mantissa means the operand is zero and
; the product is a signed zero.  Otherwise shift mantissa2 (r2:r4) left until
; the 1.0 bit (MSB) is set.  The first shift is free, because denormals have
; no hidden bit; every further shift lowers exponent2 (r1) by one.
exception4
    ORRS    r3, r2, r4      ; mantissa2 == 0 ?
    BEQ     return_zero     ;   yes: return zero
p2_norm
    ADD     r1, r1, #1      ; Pre-credit the free first shift
p2_norm_loop
    ADDS    r4, r4, r4      ; Shift the low word left; C = bit shifted out
    ADC     r2, r2, r2      ; Shift the high word left, taking C into bit 0
    SUB     r1, r1, #1      ; One place up in mantissa, one down in exponent
    CMP     r2, #MSB        ; Repeat while mantissa2 < 1.0
    BLT     p2_norm_loop    ;   ..
end_p2_norm
    B       exception_return4
                            ; Resume with both operands normalized
                            ; Done



; Cause an overflow exception (=> inexact) and return properly signed inf.
; Overflow: flag overflow and inexact, then fall through to produce inf.
return_overflow
    ORR     r5, r5, #OVF_bit    ; Report overflow
    ORR     r5, r5, #INX_bit    ;   which also implies inexact

; Build inf in the unpacked registers; return_value applies the sign from r7.
return_inf
    MOV     r0, #0              ; mantissa1 = 0
    MOV     r8, #0              ;   ..
    MOV     r9, #0x800          ; exponent1 = 2048 - 1 = 2047
    SUB     r9, r9, #1          ;   ..
    B       return_value


; Return 0.
; Produce a zero carrying the result sign (bit 31 of r7) directly in r1:r0.
return_zero
    AND     r1, r7, #0x80000000 ; High word: sign bit only
    MOV     r0, #0              ; Low word: zero
    B       return              ; Result is already packed

; Cause an invalid operation exception and return a QNaN.
; Invalid operation: record the exception, then fall through to build a QNaN.
return_invalid
    ORR     r5, r5, #IVO_bit    ; Flag invalid operation

; Build a QNaN directly in r1:r0.  The high word is mantissa1.high (r8) with
; 0x7FF80000 ORed in; r0 is returned as it stands.
return_QNaN
    ORR     r1, r8, #0x00F80000 ; Low exponent bits and mantissa bit 51
    ORR     r1, r1, #0x7F000000 ; Remaining exponent bits
    B       return              ; Result is already packed

    END