/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"

// Everything from here to the matching #endif is assembled only when both
// PICO_FLOAT_SUPPORT_ROM_V1 and PICO_RP2040_B0_SUPPORTED are non-zero.
#if PICO_FLOAT_SUPPORT_ROM_V1 && PICO_RP2040_B0_SUPPORTED

// Default to 0 when the build has not set PICO_FLOAT_IN_RAM. The float_section
// macro below tests this value to choose between RAM_SECTION_NAME and SECTION_NAME.
#ifndef PICO_FLOAT_IN_RAM
#define PICO_FLOAT_IN_RAM 0
#endif

// NOTE(review): this macro is not defined in this file; presumably it is
// supplied by pico/asm_helper.S - confirm there what assembler state it sets.
pico_default_asm_setup

// float_section name
// Switch output to an allocatable, executable ("ax") section derived from
// the given name. The section name is built by SECTION_NAME when
// PICO_FLOAT_IN_RAM is zero and by RAM_SECTION_NAME otherwise.
.macro float_section name
// todo separate flag for shims?
#if !PICO_FLOAT_IN_RAM
.section SECTION_NAME(\name), "ax"
#else
.section RAM_SECTION_NAME(\name), "ax"
#endif
.endm

float_section float_table_shim_on_use_helper
// float_table_shim_on_use_helper
// Installs a function pointer into sf_table and then jumps to that function.
//
// In:  ip = address of code that is followed, at some halfword boundary, by a
//           marker halfword whose top byte is 0xdf. The low byte of the marker
//           is the byte offset into sf_table, and the 32-bit word right after
//           the marker is the function address to install.
//           NOTE(review): 0xdfNN matches the Thumb SVC #NN encoding; presumably
//           the tail-call macro named in the sanity-check comment plants it -
//           confirm against that macro.
// Out: r0-r2 are restored to their entry values; control continues at the
//      installed function (the saved lr slot is overwritten with its address
//      and popped into pc), so the original lr is not restored here.
regular_func float_table_shim_on_use_helper
    push {r0-r2, lr}
    mov r0, ip                  // r0 = scan pointer
#ifndef NDEBUG
    // sanity check to make sure we weren't called by non (shimmable_) table_tail_call macro
    cmp r0, #0
    bne 3f
    bkpt #0
#endif
3:
    // walk forward one halfword at a time until the 0xdfNN marker is found
    ldrh r1, [r0]
    adds r0, #2                 // r0 now points just past the halfword in r1
    lsrs r2, r1, #8
    cmp r2, #0xdf
    bne 3b
    uxtb r1, r1                 // r1 = table offset taken from the marker low byte
    lsrs r2, r0, #2             // carry = bit 1 of r0, i.e. set when not word aligned
    bcs 4f
    ldr r0, [r0]                // word aligned: single load
    b 5f
4:
    // not word aligned: assemble the word from two halfword loads
    ldrh r2, [r0, #0]
    ldrh r0, [r0, #2]
    lsls r0, #16
    orrs r0, r2
5:
    ldr r2, =sf_table
    str r0, [r2, r1]            // patch the table entry
    str r0, [sp, #12]           // replace the saved lr so the pop below jumps to the function
    pop {r0-r2, pc}

float_section 642float_shims

// 64-bit integer / fixed-point to float conversion shims.
//
// Four entry points share one body and one stack frame (r4, r5, lr):
//   uint642float_shim - unsigned integer; sets r2 = 0, falls into ufix642float_shim
//   ufix642float_shim - unsigned fixed point, r2 = number of bits after the point
//   int642float_shim  - signed integer; sets r2 = 0, falls into fix642float_shim
//   fix642float_shim  - signed fixed point, r2 = number of bits after the point
// The 64-bit input is in r0 (low word) : r1 (high word); the sign tests and
// the 64-bit shifts below treat r1 as the most significant word.
// An all-zero input returns with r0 = 0 without calling the pack routine.
// Otherwise the value is handed to the routine at 0x29ef with r0 = top word,
// r5 = remaining low (sticky) bits and r2 = 61 - (point position after shifts).
// NOTE(review): 0x29ef is an absolute address labelled packx by the existing
// comment, presumably in the V1 bootrom - its exact register contract is not
// visible here; confirm against the ROM listing.

@ convert uint64 to float, rounding
regular_func uint642float_shim
 movs r2,#0       @ fall through

@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func ufix642float_shim
 push {r4,r5,r14}
 cmp r1,#0
 bpl 3f          @ positive? we can use signed code
 // top bit set: pre-shift right by one so the value fits the signed path
 lsls r5,r1,#31  @ contribution to sticky bits
 orrs r5,r0
 lsrs r0,r1,#1
 subs r2,#1      // one fewer fractional bit after the right shift
 b 4f

@ convert int64 to float, rounding
regular_func int642float_shim
 movs r2,#0       @ fall through

@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
regular_func fix642float_shim
 push {r4,r5,r14}
3:
 movs r5,r0
 orrs r5,r1
 beq ret_pop45   @ zero? return +0
 asrs r5,r1,#31  @ sign bits
2:
 // normalise: shift r1:r0 left 7 bits per pass while the top 8 bits of r1
 // are all copies of the sign, counting the shift in r2
 asrs r4,r1,#24  @ try shifting 7 bits at a time
 cmp r4,r5
 bne 1f          @ next shift will overflow?
 lsls r1,#7
 lsrs r4,r0,#25
 orrs r1,r4
 lsls r0,#7
 adds r2,#7
 b 2b
1:
 movs r5,r0      // low word becomes the sticky bits
 movs r0,r1      // high word becomes the value handed on
4:
 negs r2,r2
 adds r2,#32+29  // r2 = 61 - r2

 // bl packx
 ldr r1, =0x29ef // packx
 blx r1
ret_pop45:
 pop {r4,r5,r15}

float_section fatan2_shim
// fatan2_shim
// Two-argument arctangent glue around routines at fixed addresses.
//
// In:  r0, r1 = the two float arguments. Judging by the existing comments,
//      after the two unpack calls and the register swap between them,
//      r0/r2 hold the mantissa/exponent of x (the second argument) and
//      r1/r3 hold those of y (the first argument).
// Flow: the operand with the smaller exponent is shifted down to a common
//      exponent. If the exponent sum test fires (both inputs zero per the
//      existing comment) or the shift applied to y is 28 or more, the result
//      is a signed zero or a signed pi taken from the word at 0x2cfc.
//      Otherwise the routine at 0x2b97 is called and its r2 result is folded
//      into range before packing.
// Exit: tail-jumps to 0x2b19 with r2 = 0.
// NOTE(review): the r4/r5/lr frame pushed at entry is still on the stack at
//      that jump, so the code at 0x2b19 is presumably expected to pack the
//      result and pop that frame - confirm against the ROM listing.
// NOTE(review): 0x29c1, 0x2b97, 0x2cfc and 0x2b19 are absolute addresses,
//      presumably in the V1 bootrom; the names used for them come from the
//      existing comments.
regular_func fatan2_shim
 push {r4,r5,r14}

 ldr r4, =0x29c1 // unpackx
 mov ip, r4      // keep the routine address in ip so it can be called twice
@ unpack arguments and shift one down to have common exponent
 blx ip
 // swap r0 with r1 and r2 with r3 so the second call works on the other argument
 mov r4,r0
 mov r0,r1
 mov r1,r4
 mov r4,r2
 mov r2,r3
 mov r3,r4
 blx ip
 lsls r0,r0,#5  @ Q28
 lsls r1,r1,#5  @ Q28
 adds r4,r2,r3  @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
 asrs r4,#9
 adds r4,#1
 bmi 2f         @ force y to 0 proper, so result will be zero
 subs r4,r2,r3  @ calculate shift
 bge 1f         @ ex>=ey?
 negs r4,r4     @ make shift positive
 asrs r0,r4
 cmp r4,#28
 blo 3f
 asrs r0,#31    // shift of 28 or more: reduce x to its sign bits
 b 3f
1:
 asrs r1,r4
 cmp r4,#28
 blo 3f
2:
@ here |x|>>|y| or both x and y are ±0
 cmp r0,#0
 bge 4f         @ x positive, return signed 0
 ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
 ldr r0,[r3]    @ x negative, return +/- pi
 asrs r1,#31    // r1 = sign mask of y
 eors r0,r1     // flip every bit of pi when y is negative
 b 7f
4:
 asrs r0,r1,#31 // result mantissa is 0 or -1 according to the sign of y
 b 7f
3:
 movs r2,#0              @ initial angle
 ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
 cmp r0,#0               @ x negative
 bge 5f
 negs r0,r0              @ rotate to 1st/4th quadrants
 negs r1,r1
 ldr r2,[r3]             @ pi Q29
5:
 movs r4,#1              @ m=1
 ldr r5, =0x2b97         @ cordic_vec
 blx r5                  @ also produces magnitude (with scaling factor 1.646760119), which is discarded
 mov r0,r2               @ result here is -pi/2..3pi/2 Q29
@ asrs r2,#29
@ subs r0,r2
 ldr r3, =0x2cfc         @ &pi_q29, circular coefficients
 ldr r2,[r3]             @ pi Q29
 adds r4,r0,r2           @ attempt to fix -3pi/2..-pi case
 bcs 6f                  @ -pi/2..0? leave result as is
 subs r4,r0,r2           @ <pi? leave as is
 bmi 6f
 subs r0,r4,r2           @ >pi: take off 2pi
6:
 subs r0,#1              @ fiddle factor so atan2(0,1)==0
7:
 movs r2,#0              @ exponent for pack
 ldr r3, =0x2b19
 bx r3

float_section float232_shims

// float2int_shim / float2fix_shim
// In:  r0 = float bit pattern; float2fix_shim also takes r1, which
//      float2int_shim sets to 0 before falling through.
// Out: r0 = 0 when the input has its sign bit set and an all-zero exponent
//      field (negative zero or negative denormal). Every other input is
//      passed unchanged, with r1 and lr intact, to the routine at 0x2acd.
// NOTE(review): 0x2acd is an absolute address, presumably the V1 bootrom
//      conversion routine - confirm.
// r2 is used as scratch.
regular_func float2int_shim
     movs r1,#0                    @ fall through
regular_func float2fix_shim
     // sign and exponent, arithmetically shifted down, equal -256 only for
     // the -0 / -denormal case; adding 256 in two steps leaves zero then
     asrs r2, r0, #23
     adds r2, #128
     adds r2, #128
     bne 2f
     // -0 or -denormal: answer is zero
     movs r0, #0
     bx lr
     2:
     // everything else: tail-call the original
     ldr r2, =0x2acd
     bx r2

float_section float264_shims

// float to 64-bit integer / fixed-point conversion shims.
//
// In (all entry points): r0 = float bit pattern; the fix variants also take
// r1 = number of places after the binary point, which the int variants set to 0.
//   float2int64_shim / float2fix64_shim  : push lr, call f2fix, then branch to
//       d2f64_a, which checks the sign of r1 against r3 and returns by popping pc.
//   float2uint64_shim / float2ufix64_shim: inputs with the sign bit set return
//       r0:r1 = 0 via ret_dzero; everything else falls straight into f2fix,
//       which returns to the original caller.
// Result is in r0 (low word) : r1 (high word).

regular_func float2int64_shim
 movs r1,#0                    @ and fall through
regular_func float2fix64_shim
 push {r14}
 bl f2fix
 b d2f64_a                     // saturate to the signed 64-bit range; pops pc

regular_func float2uint64_shim
 movs r1,#0                    @ and fall through
regular_func float2ufix64_shim
 asrs r3,r0,#23                @ negative? return 0
 bmi ret_dzero
@ and fall through

@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
// An exponent field of 0 (zero or denormal) returns r0 = r1 = r3 = 0.
// An exponent field of 0xff returns r0 = r1 = the complement of the sign mask.
// All other inputs are expanded to a 64-bit mantissa and passed to d2fix_a,
// which performs the shift and pops the r4/lr frame pushed here.
.thumb_func
f2fix:
 push {r4,r14}
 mov r12,r1                    // r12 = places after the point, consumed by d2fix_a
 asrs r3,r0,#31                // r3 = sign mask, 0 or -1
 lsls r0,#1                    // drop the sign bit
 lsrs r2,r0,#24                // r2 = biased exponent
 beq 1f                        @ zero?
 cmp r2,#0xff                  @ Inf?
 beq 2f
 subs r1,r2,#1
 subs r2,#0x7f                 @ remove exponent bias
 lsls r1,#24
 subs r0,r1                    @ insert implied 1
 eors r0,r3
 subs r0,r3                    @ top two's complement
 asrs r1,r0,#4                 @ convert to double format
 lsls r0,#28
 ldr r4, =d2fix_a              // reached through a literal; r4 itself is restored by the pop in d2fix_a
 bx r4
1:
 movs r0,#0
 movs r1,r0
 movs r3,r0
 pop {r4,r15}
2:
 mvns r0,r3                    @ return max/min value
 mvns r1,r3
 pop {r4,r15}

// shared exit for the unsigned entry points: return r0:r1 = 0
ret_dzero:
 movs r0,#0
 movs r1,#0
 bx r14

float_section d2fix_a_float

// d2fix_a: shift a 64-bit two's complement mantissa into fixed-point position.
// Reached by a jump, not a call: every exit pops {r4, pc}, so the r4/lr frame
// must already have been pushed by the code that jumps here (f2fix does this).
// r12 = number of places after the binary point requested by the caller.
// The required shift is r2 + r12 - 52:
//   12 or more   -> result clamped to the complement of r3 in both words
//   0 to 11      -> 64-bit shift left
//   -1 to -32    -> 64-bit arithmetic shift right
//   -33 to -64   -> high word moved to low word, then shifted right again
//   below -64    -> both words set to the sign extension bits in r3
// Result in r0 (low word) : r1 (high word); r3 is left unchanged.
.weak d2fix_a // weak because it exists in float shims too
.thumb_func
d2fix_a:
@ here
@ r0:r1 two's complement mantissa
@ r2    unbiased exponent
@ r3    mantissa sign extension bits
 add r2,r12                    @ exponent plus offset for required binary point position
 subs r2,#52                   @ required shift
 bmi 1f                        @ shift down?
@ here a shift up by r2 places
 cmp r2,#12                    @ will clamp?
 bge 2f
 movs r4,r0                    // keep the low word to supply the bits carried into r1
 lsls r1,r2
 lsls r0,r2
 negs r2,r2
 adds r2,#32                   @ complementary shift
 lsrs r4,r2
 orrs r1,r4
 pop {r4,r15}
2:
 mvns r0,r3
 mvns r1,r3                    @ overflow: clamp to extreme fixed-point values
 pop {r4,r15}
1:
@ here a shift down by -r2 places
 adds r2,#32
 bmi 1f                        @ long shift?
 mov r4,r1                     // keep the high word to supply the bits carried into r0
 lsls r4,r2
 negs r2,r2
 adds r2,#32                   @ complementary shift
 asrs r1,r2
 lsrs r0,r2
 orrs r0,r4
 pop {r4,r15}
1:
@ here a long shift down
 movs r0,r1
 asrs r1,#31                   @ shift down 32 places
 adds r2,#32
 bmi 1f                        @ very long shift?
 negs r2,r2
 adds r2,#32
 asrs r0,r2
 pop {r4,r15}
1:
 movs r0,r3                    @ result very near zero: use sign extension bits
 movs r1,r3
 pop {r4,r15}
// d2f64_a: final range check for the signed 64-bit conversions.
// Entered by a branch from float2fix64_shim with only lr on the stack.
// If the sign of r1 agrees with r3 the value in r0:r1 is returned as is.
// Otherwise r0:r1 becomes 0xffffffff:0x7fffffff when r3 is 0, or
// 0x00000000:0x80000000 when r3 is -1.
d2f64_a:
 asrs r2,r1,#31
 cmp r2,r3
 bne 1f                        @ sign extension bits fail to match sign of result?
 pop {r15}
1:
 mvns r0,r3
 movs r1,#1
 lsls r1,#31
 eors r1,r1,r0                 @ generate extreme fixed-point values
 pop {r15}

float_section float2double_shim
// float2double_shim
// Widen the single-precision bit pattern in r0 to a double-precision bit
// pattern in r0 (low word) : r1 (high word).
//   exponent field 0    -> signed zero (denormal inputs are flushed to zero)
//   exponent field 0xff -> signed infinity, whatever the mantissa bits are
//   anything else       -> exponent rebiased by 0x3ff-0x7f, mantissa widened
// r2 and r3 are used as scratch.
regular_func float2double_shim
 lsrs r3,r0,#31
 lsls r3,#31                   // r3 = sign bit alone, in bit 31
 lsls r1,r0,#1                 // r1 = exponent and mantissa with the sign shifted out
 lsrs r2,r1,#24                // r2 = 8-bit exponent field
 beq 4f                        // exponent field zero
 cmp r2,#0xff
 beq 5f                        // exponent field all ones
 lsrs r1,#4                    // exponent and top 20 mantissa bits in double position
 ldr r2,=(0x3ff-0x7f)<<20     // difference between the two exponent biases
 adds r1,r2
 orrs r1,r3                    // put the sign back
 lsls r0,#29                   // remaining 3 mantissa bits go to the top of the low word
 bx lr
4:
 // signed zero: high word is just the sign, low word is zero
 movs r1,r3
 movs r0,#0
 bx lr
5:
 // signed infinity: exponent all ones, mantissa zero
 ldr r1,=0x7ff00000
 adds r1,r3
 movs r0,#0
 bx lr

#endif