| //===----------------------Hexagon builtin routine ------------------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is dual licensed under the MIT and the University of Illinois Open |
| // Source Licenses. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| /* Double Precision square root */ |
| |
| #define EXP r28 |
| |
| #define A r1:0 |
| #define AH r1 |
| #define AL r0 |
| |
| #define SFSH r3:2 |
| #define SF_S r3 |
| #define SF_H r2 |
| |
| #define SFHALF_SONE r5:4 |
| #define S_ONE r4 |
| #define SFHALF r5 |
| #define SF_D r6 |
| #define SF_E r7 |
| #define RECIPEST r8 |
| #define SFRAD r9 |
| |
| #define FRACRAD r11:10 |
| #define FRACRADH r11 |
| #define FRACRADL r10 |
| |
| #define ROOT r13:12 |
| #define ROOTHI r13 |
| #define ROOTLO r12 |
| |
| #define PROD r15:14 |
| #define PRODHI r15 |
| #define PRODLO r14 |
| |
| #define P_TMP p0 |
| #define P_EXP1 p1 |
| #define NORMAL p2 |
| |
| #define SF_EXPBITS 8 |
| #define SF_MANTBITS 23 |
| |
| #define DF_EXPBITS 11 |
| #define DF_MANTBITS 52 |
| |
| #define DF_BIAS 0x3ff |
| |
| #define DFCLASS_ZERO 0x01 |
| #define DFCLASS_NORMAL 0x02 |
| #define DFCLASS_DENORMAL 0x02 |
| #define DFCLASS_INFINITE 0x08 |
| #define DFCLASS_NAN 0x10 |
| |
| #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function |
| #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function |
| #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function |
| #define END(TAG) .size TAG,.-TAG |
| |
| .text |
| .global __hexagon_sqrtdf2 |
| .type __hexagon_sqrtdf2,@function |
| .global __hexagon_sqrt |
| .type __hexagon_sqrt,@function |
| Q6_ALIAS(sqrtdf2) |
| Q6_ALIAS(sqrt) |
| FAST_ALIAS(sqrtdf2) |
| FAST_ALIAS(sqrt) |
| FAST2_ALIAS(sqrtdf2) |
| FAST2_ALIAS(sqrt) |
| .type sqrt,@function |
| .p2align 5 |
| __hexagon_sqrtdf2: |
| __hexagon_sqrt: |
| { |
| PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) |
| EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) |
| SFHALF_SONE = combine(##0x3f000004,#1) |
| } |
| { |
| NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal |
| NORMAL = cmp.gt(AH,#-1) // and positive? |
| if (!NORMAL.new) jump:nt .Lsqrt_abnormal |
| SFRAD = or(SFHALF,PRODLO) |
| } |
| #undef NORMAL |
| .Ldenormal_restart: |
| { |
| FRACRAD = A |
| SF_E,P_TMP = sfinvsqrta(SFRAD) |
| SFHALF = and(SFHALF,#-16) |
| SFSH = #0 |
| } |
| #undef A |
| #undef AH |
| #undef AL |
| #define ERROR r1:0 |
| #define ERRORHI r1 |
| #define ERRORLO r0 |
| // SF_E : reciprocal square root |
| // SF_H : half rsqrt |
| // sf_S : square root |
| // SF_D : error term |
| // SFHALF: 0.5 |
| { |
| SF_S += sfmpy(SF_E,SFRAD):lib // s0: root |
| SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... |
| SF_D = SFHALF |
| #undef SFRAD |
| #define SHIFTAMT r9 |
| SHIFTAMT = and(EXP,#1) |
| } |
| { |
| SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 |
| FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden |
| P_EXP1 = cmp.gtu(SHIFTAMT,#0) |
| } |
| { |
| SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt |
| SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip |
| SF_D = SFHALF |
| SHIFTAMT = mux(P_EXP1,#8,#9) |
| } |
| { |
| SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term |
| FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place |
| SHIFTAMT = mux(P_EXP1,#3,#2) |
| } |
| { |
| SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt |
| // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). |
| PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) |
| } |
| { |
| SF_H = and(SF_H,##0x007fffff) |
| } |
| { |
| SF_H = add(SF_H,##0x00800000 - 3) |
| SHIFTAMT = mux(P_EXP1,#7,#8) |
| } |
| { |
| RECIPEST = asl(SF_H,SHIFTAMT) |
| SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) |
| } |
| { |
| ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) |
| } |
| |
| #undef SFSH // r3:2 |
| #undef SF_H // r2 |
| #undef SF_S // r3 |
| #undef S_ONE // r4 |
| #undef SFHALF // r5 |
| #undef SFHALF_SONE // r5:4 |
| #undef SF_D // r6 |
| #undef SF_E // r7 |
| |
| #define HL r3:2 |
| #define LL r5:4 |
| #define HH r7:6 |
| |
| #undef P_EXP1 |
| #define P_CARRY0 p1 |
| #define P_CARRY1 p2 |
| #define P_CARRY2 p3 |
| |
| /* Iteration 0 */ |
| /* Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply */ |
| /* We can shift and subtract instead of shift and add? */ |
| { |
| ERROR = asl(FRACRAD,#15) |
| PROD = mpyu(ROOTHI,ROOTHI) |
| P_CARRY0 = cmp.eq(r0,r0) |
| } |
| { |
| ERROR -= asl(PROD,#15) |
| PROD = mpyu(ROOTHI,ROOTLO) |
| P_CARRY1 = cmp.eq(r0,r0) |
| } |
| { |
| ERROR -= lsr(PROD,#16) |
| P_CARRY2 = cmp.eq(r0,r0) |
| } |
| { |
| ERROR = mpyu(ERRORHI,RECIPEST) |
| } |
| { |
| ROOT += lsr(ERROR,SHIFTAMT) |
| SHIFTAMT = add(SHIFTAMT,#16) |
| ERROR = asl(FRACRAD,#31) // for next iter |
| } |
| /* Iteration 1 */ |
| { |
| PROD = mpyu(ROOTHI,ROOTHI) |
| ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed |
| } |
| { |
| ERROR -= asl(PROD,#31) |
| PROD = mpyu(ROOTLO,ROOTLO) |
| } |
| { |
| ERROR -= lsr(PROD,#33) |
| } |
| { |
| ERROR = mpyu(ERRORHI,RECIPEST) |
| } |
| { |
| ROOT += lsr(ERROR,SHIFTAMT) |
| SHIFTAMT = add(SHIFTAMT,#16) |
| ERROR = asl(FRACRAD,#47) // for next iter |
| } |
| /* Iteration 2 */ |
| { |
| PROD = mpyu(ROOTHI,ROOTHI) |
| } |
| { |
| ERROR -= asl(PROD,#47) |
| PROD = mpyu(ROOTHI,ROOTLO) |
| } |
| { |
| ERROR -= asl(PROD,#16) // bidir shr 31-47 |
| PROD = mpyu(ROOTLO,ROOTLO) |
| } |
| { |
| ERROR -= lsr(PROD,#17) // 64-47 |
| } |
| { |
| ERROR = mpyu(ERRORHI,RECIPEST) |
| } |
| { |
| ROOT += lsr(ERROR,SHIFTAMT) |
| } |
| #undef ERROR |
| #undef PROD |
| #undef PRODHI |
| #undef PRODLO |
| #define REM_HI r15:14 |
| #define REM_HI_HI r15 |
| #define REM_LO r1:0 |
| #undef RECIPEST |
| #undef SHIFTAMT |
| #define TWOROOT_LO r9:8 |
| /* Adjust Root */ |
| { |
| HL = mpyu(ROOTHI,ROOTLO) |
| LL = mpyu(ROOTLO,ROOTLO) |
| REM_HI = #0 |
| REM_LO = #0 |
| } |
| { |
| HL += lsr(LL,#33) |
| LL += asl(HL,#33) |
| P_CARRY0 = cmp.eq(r0,r0) |
| } |
| { |
| HH = mpyu(ROOTHI,ROOTHI) |
| REM_LO = sub(REM_LO,LL,P_CARRY0):carry |
| TWOROOT_LO = #1 |
| } |
| { |
| HH += lsr(HL,#31) |
| TWOROOT_LO += asl(ROOT,#1) |
| } |
| #undef HL |
| #undef LL |
| #define REM_HI_TMP r3:2 |
| #define REM_HI_TMP_HI r3 |
| #define REM_LO_TMP r5:4 |
| { |
| REM_HI = sub(FRACRAD,HH,P_CARRY0):carry |
| REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry |
| #undef FRACRAD |
| #undef HH |
| #define ZERO r11:10 |
| #define ONE r7:6 |
| ONE = #1 |
| ZERO = #0 |
| } |
| { |
| REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry |
| ONE = add(ROOT,ONE) |
| EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp |
| } |
| { |
| // If carry set, no borrow: result was still positive |
| if (P_CARRY1) ROOT = ONE |
| if (P_CARRY1) REM_LO = REM_LO_TMP |
| if (P_CARRY1) REM_HI = REM_HI_TMP |
| } |
| { |
| REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry |
| ONE = #1 |
| EXP = asr(EXP,#1) // divide signed exp by 2 |
| } |
| { |
| REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry |
| ONE = add(ROOT,ONE) |
| } |
| { |
| if (P_CARRY2) ROOT = ONE |
| if (P_CARRY2) REM_LO = REM_LO_TMP |
| // since tworoot <= 2^32, remhi must be zero |
| #undef REM_HI_TMP |
| #undef REM_HI_TMP_HI |
| #define S_ONE r2 |
| #define ADJ r3 |
| S_ONE = #1 |
| } |
| { |
| P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero |
| if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully |
| ADJ = cl0(ROOT) |
| EXP = add(EXP,#-63) |
| } |
| #undef REM_LO |
| #define RET r1:0 |
| #define RETHI r1 |
| { |
| RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag |
| EXP = add(EXP,ADJ) // add back bias |
| } |
| { |
| RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust |
| jumpr r31 |
| } |
| #undef REM_LO_TMP |
| #undef REM_HI_TMP |
| #undef REM_HI_TMP_HI |
| #undef REM_LO |
| #undef REM_HI |
| #undef TWOROOT_LO |
| |
| #undef RET |
| #define A r1:0 |
| #define AH r1 |
| #define AL r1 |
| #undef S_ONE |
| #define TMP r3:2 |
| #define TMPHI r3 |
| #define TMPLO r2 |
| #undef P_CARRY0 |
| #define P_NEG p1 |
| |
| |
| #define SFHALF r5 |
| #define SFRAD r9 |
| .Lsqrt_abnormal: |
| { |
| P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? |
| if (P_TMP.new) jumpr:t r31 |
| } |
| { |
| P_TMP = dfclass(A,#DFCLASS_NAN) |
| if (P_TMP.new) jump:nt .Lsqrt_nan |
| } |
| { |
| P_TMP = cmp.gt(AH,#-1) |
| if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg |
| if (!P_TMP.new) EXP = ##0x7F800001 // sNaN |
| } |
| { |
| P_TMP = dfclass(A,#DFCLASS_INFINITE) |
| if (P_TMP.new) jumpr:nt r31 |
| } |
| // If we got here, we're denormal |
| // prepare to restart |
| { |
| A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa |
| } |
| { |
| EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? |
| } |
| { |
| A = asl(A,EXP) // Shift mantissa |
| EXP = sub(#1,EXP) // Form exponent |
| } |
| { |
| AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent |
| } |
| { |
| TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) |
| SFHALF = ##0x3f000004 // form half constant |
| } |
| { |
| SFRAD = or(SFHALF,TMPLO) // form sf value |
| SFHALF = and(SFHALF,#-16) |
| jump .Ldenormal_restart // restart |
| } |
| .Lsqrt_nan: |
| { |
| EXP = convert_df2sf(A) // if sNaN, get invalid |
| A = #-1 // qNaN |
| jumpr r31 |
| } |
| .Lsqrt_invalid_neg: |
| { |
| A = convert_sf2df(EXP) // Invalid,NaNval |
| jumpr r31 |
| } |
| END(__hexagon_sqrt) |
| END(__hexagon_sqrtdf2) |