| #if defined(__aarch64__) |
| .text |
| |
| // int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, |
| //                 const BN_ULONG *np, const BN_ULONG *n0, int num); |
| // |
| // Montgomery multiplication: rp[] = ap[]*bp[]/2^(64*num) mod np[]. |
| // Register arguments: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num (limbs). |
| // x4 is dereferenced once to load the scalar n0 = -np[0]^-1 mod 2^64. |
| // A num-limb temporary tp[] is allocated on the stack, used as the |
| // running accumulator, and wiped before return.  Returns 1 in x0. |
| // When num is a multiple of 8 (or 4) control is handed to the wider |
| // 8x-squaring / 4x-multiplication code paths below. |
| .globl bn_mul_mont |
| .hidden bn_mul_mont |
| .type bn_mul_mont,%function |
| .align 5 |
| bn_mul_mont: |
| tst x5,#7 |
| b.eq __bn_sqr8x_mont |
| tst x5,#3 |
| b.eq __bn_mul4x_mont |
| .Lmul_mont: |
| stp x29,x30,[sp,#-64]! |
| add x29,sp,#0 |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| |
| ldr x9,[x2],#8 // bp[0] |
| sub x22,sp,x5,lsl#3 |
| ldp x7,x8,[x1],#16 // ap[0..1] |
| lsl x5,x5,#3 |
| ldr x4,[x4] // *n0 |
| and x22,x22,#-16 // ABI says so |
| ldp x13,x14,[x3],#16 // np[0..1] |
| |
| mul x6,x7,x9 // ap[0]*bp[0] |
| sub x21,x5,#16 // j=num-2 |
| umulh x7,x7,x9 |
| mul x10,x8,x9 // ap[1]*bp[0] |
| umulh x11,x8,x9 |
| |
| mul x15,x6,x4 // "tp[0]"*n0 |
| mov sp,x22 // alloca |
| |
| // (*) mul x12,x13,x15 // np[0]*m1 |
| umulh x13,x13,x15 |
| mul x16,x14,x15 // np[1]*m1 |
| // (*) adds x12,x12,x6 // discarded |
| // (*) As for removal of first multiplication and addition |
| // instructions. The outcome of first addition is |
| // guaranteed to be zero, which leaves two computationally |
| // significant outcomes: it either carries or not. Then |
| // question is when does it carry? Is there alternative |
| // way to deduce it? If you follow operations, you can |
| // observe that condition for carry is quite simple: |
| // x6 being non-zero. So that carry can be calculated |
| // by adding -1 to x6. That's what next instruction does. |
| subs xzr,x6,#1 // (*) |
| umulh x17,x14,x15 |
| adc x13,x13,xzr |
| cbz x21,.L1st_skip |
| |
| // First iteration (i=0): tp[] = ap[]*bp[0] + m1*np[], one limb per pass. |
| // x7/x11 carry hi halves of ap products, x13/x17 of np products. |
| .L1st: |
| ldr x8,[x1],#8 |
| adds x6,x10,x7 |
| sub x21,x21,#8 // j-- |
| adc x7,x11,xzr |
| |
| ldr x14,[x3],#8 |
| adds x12,x16,x13 |
| mul x10,x8,x9 // ap[j]*bp[0] |
| adc x13,x17,xzr |
| umulh x11,x8,x9 |
| |
| adds x12,x12,x6 |
| mul x16,x14,x15 // np[j]*m1 |
| adc x13,x13,xzr |
| umulh x17,x14,x15 |
| str x12,[x22],#8 // tp[j-1] |
| cbnz x21,.L1st |
| |
| .L1st_skip: |
| adds x6,x10,x7 |
| sub x1,x1,x5 // rewind x1 |
| adc x7,x11,xzr |
| |
| adds x12,x16,x13 |
| sub x3,x3,x5 // rewind x3 |
| adc x13,x17,xzr |
| |
| adds x12,x12,x6 |
| sub x20,x5,#8 // i=num-1 |
| adcs x13,x13,x7 |
| |
| adc x19,xzr,xzr // upmost overflow bit |
| stp x12,x13,[x22] |
| |
| // Outer loop over remaining bp[i]: tp[] = (tp[]+ap[]*bp[i]+m1*np[])/2^64. |
| .Louter: |
| ldr x9,[x2],#8 // bp[i] |
| ldp x7,x8,[x1],#16 |
| ldr x23,[sp] // tp[0] |
| add x22,sp,#8 |
| |
| mul x6,x7,x9 // ap[0]*bp[i] |
| sub x21,x5,#16 // j=num-2 |
| umulh x7,x7,x9 |
| ldp x13,x14,[x3],#16 |
| mul x10,x8,x9 // ap[1]*bp[i] |
| adds x6,x6,x23 |
| umulh x11,x8,x9 |
| adc x7,x7,xzr |
| |
| mul x15,x6,x4 |
| sub x20,x20,#8 // i-- |
| |
| // (*) mul x12,x13,x15 // np[0]*m1 |
| umulh x13,x13,x15 |
| mul x16,x14,x15 // np[1]*m1 |
| // (*) adds x12,x12,x6 |
| subs xzr,x6,#1 // (*) |
| umulh x17,x14,x15 |
| cbz x21,.Linner_skip |
| |
| .Linner: |
| ldr x8,[x1],#8 |
| adc x13,x13,xzr |
| ldr x23,[x22],#8 // tp[j] |
| adds x6,x10,x7 |
| sub x21,x21,#8 // j-- |
| adc x7,x11,xzr |
| |
| adds x12,x16,x13 |
| ldr x14,[x3],#8 |
| adc x13,x17,xzr |
| |
| mul x10,x8,x9 // ap[j]*bp[i] |
| adds x6,x6,x23 |
| umulh x11,x8,x9 |
| adc x7,x7,xzr |
| |
| mul x16,x14,x15 // np[j]*m1 |
| adds x12,x12,x6 |
| umulh x17,x14,x15 |
| str x12,[x22,#-16] // tp[j-1] |
| cbnz x21,.Linner |
| |
| .Linner_skip: |
| ldr x23,[x22],#8 // tp[j] |
| adc x13,x13,xzr |
| adds x6,x10,x7 |
| sub x1,x1,x5 // rewind x1 |
| adc x7,x11,xzr |
| |
| adds x12,x16,x13 |
| sub x3,x3,x5 // rewind x3 |
| adcs x13,x17,x19 |
| adc x19,xzr,xzr |
| |
| adds x6,x6,x23 |
| adc x7,x7,xzr |
| |
| adds x12,x12,x6 |
| adcs x13,x13,x7 |
| adc x19,x19,xzr // upmost overflow bit |
| stp x12,x13,[x22,#-16] |
| |
| cbnz x20,.Louter |
| |
| // Final step. We see if result is larger than modulus, and |
| // if it is, subtract the modulus. But comparison implies |
| // subtraction. So we subtract modulus, see if it borrowed, |
| // and conditionally copy original value. |
| ldr x23,[sp] // tp[0] |
| add x22,sp,#8 |
| ldr x14,[x3],#8 // np[0] |
| subs x21,x5,#8 // j=num-1 and clear borrow |
| mov x1,x0 |
| .Lsub: |
| sbcs x8,x23,x14 // tp[j]-np[j] |
| ldr x23,[x22],#8 |
| sub x21,x21,#8 // j-- |
| ldr x14,[x3],#8 |
| str x8,[x1],#8 // rp[j]=tp[j]-np[j] |
| cbnz x21,.Lsub |
| |
| sbcs x8,x23,x14 |
| sbcs x19,x19,xzr // did it borrow? |
| str x8,[x1],#8 // rp[num-1] |
| |
| // Constant-flow select: keep tp[] (if the subtraction borrowed) or the |
| // freshly written difference in rp[], while zeroing tp[] behind us. |
| ldr x23,[sp] // tp[0] |
| add x22,sp,#8 |
| ldr x8,[x0],#8 // rp[0] |
| sub x5,x5,#8 // num-- |
| nop |
| .Lcond_copy: |
| sub x5,x5,#8 // num-- |
| csel x14,x23,x8,lo // did it borrow? |
| ldr x23,[x22],#8 |
| ldr x8,[x0],#8 |
| str xzr,[x22,#-16] // wipe tp |
| str x14,[x0,#-16] |
| cbnz x5,.Lcond_copy |
| |
| csel x14,x23,x8,lo |
| str xzr,[x22,#-8] // wipe tp |
| str x14,[x0,#-8] |
| |
| ldp x19,x20,[x29,#16] |
| mov sp,x29 |
| ldp x21,x22,[x29,#32] |
| mov x0,#1 |
| ldp x23,x24,[x29,#48] |
| ldr x29,[sp],#64 |
| ret |
| .size bn_mul_mont,.-bn_mul_mont |
| // Squaring path, taken from bn_mul_mont when num is a multiple of 8. |
| // Unless ap==bp (a genuine squaring) it falls through to the generic |
| // 4x multiplication path.  Arguments as bn_mul_mont: x0=rp, x1=ap(=bp), |
| // x3=np, x4=&n0, x5=num.  Strategy: first compute the full 2*num-limb |
| // square in a stack buffer t[] (cross products doubled, squares added |
| // on the diagonal), then Montgomery-reduce it 512 bits per iteration. |
| .type __bn_sqr8x_mont,%function |
| .align 5 |
| __bn_sqr8x_mont: |
| cmp x1,x2 |
| b.ne __bn_mul4x_mont |
| .Lsqr8x_mont: |
| stp x29,x30,[sp,#-128]! |
| add x29,sp,#0 |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| stp x25,x26,[sp,#64] |
| stp x27,x28,[sp,#80] |
| stp x0,x3,[sp,#96] // offload rp and np |
| |
| ldp x6,x7,[x1,#8*0] |
| ldp x8,x9,[x1,#8*2] |
| ldp x10,x11,[x1,#8*4] |
| ldp x12,x13,[x1,#8*6] |
| |
| sub x2,sp,x5,lsl#4 |
| lsl x5,x5,#3 |
| ldr x4,[x4] // *n0 |
| mov sp,x2 // alloca |
| sub x27,x5,#8*8 |
| b .Lsqr8x_zero_start |
| |
| // Zero the 2*num-limb temporary t[], 16 limbs per pass. |
| .Lsqr8x_zero: |
| sub x27,x27,#8*8 |
| stp xzr,xzr,[x2,#8*0] |
| stp xzr,xzr,[x2,#8*2] |
| stp xzr,xzr,[x2,#8*4] |
| stp xzr,xzr,[x2,#8*6] |
| .Lsqr8x_zero_start: |
| stp xzr,xzr,[x2,#8*8] |
| stp xzr,xzr,[x2,#8*10] |
| stp xzr,xzr,[x2,#8*12] |
| stp xzr,xzr,[x2,#8*14] |
| add x2,x2,#8*16 |
| cbnz x27,.Lsqr8x_zero |
| |
| add x3,x1,x5 |
| add x1,x1,#8*8 |
| mov x19,xzr |
| mov x20,xzr |
| mov x21,xzr |
| mov x22,xzr |
| mov x23,xzr |
| mov x24,xzr |
| mov x25,xzr |
| mov x26,xzr |
| mov x2,sp |
| str x4,[x29,#112] // offload n0 |
| |
| // Multiply everything but a[i]*a[i] |
| .align 4 |
| .Lsqr8x_outer_loop: |
| // a[1]a[0] (i) |
| // a[2]a[0] |
| // a[3]a[0] |
| // a[4]a[0] |
| // a[5]a[0] |
| // a[6]a[0] |
| // a[7]a[0] |
| // a[2]a[1] (ii) |
| // a[3]a[1] |
| // a[4]a[1] |
| // a[5]a[1] |
| // a[6]a[1] |
| // a[7]a[1] |
| // a[3]a[2] (iii) |
| // a[4]a[2] |
| // a[5]a[2] |
| // a[6]a[2] |
| // a[7]a[2] |
| // a[4]a[3] (iv) |
| // a[5]a[3] |
| // a[6]a[3] |
| // a[7]a[3] |
| // a[5]a[4] (v) |
| // a[6]a[4] |
| // a[7]a[4] |
| // a[6]a[5] (vi) |
| // a[7]a[5] |
| // a[7]a[6] (vii) |
| |
| mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) |
| mul x15,x8,x6 |
| mul x16,x9,x6 |
| mul x17,x10,x6 |
| adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) |
| mul x14,x11,x6 |
| adcs x21,x21,x15 |
| mul x15,x12,x6 |
| adcs x22,x22,x16 |
| mul x16,x13,x6 |
| adcs x23,x23,x17 |
| umulh x17,x7,x6 // hi(a[1..7]*a[0]) |
| adcs x24,x24,x14 |
| umulh x14,x8,x6 |
| adcs x25,x25,x15 |
| umulh x15,x9,x6 |
| adcs x26,x26,x16 |
| umulh x16,x10,x6 |
| stp x19,x20,[x2],#8*2 // t[0..1] |
| adc x19,xzr,xzr // t[8] |
| adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) |
| umulh x17,x11,x6 |
| adcs x22,x22,x14 |
| umulh x14,x12,x6 |
| adcs x23,x23,x15 |
| umulh x15,x13,x6 |
| adcs x24,x24,x16 |
| mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) |
| adcs x25,x25,x17 |
| mul x17,x9,x7 |
| adcs x26,x26,x14 |
| mul x14,x10,x7 |
| adc x19,x19,x15 |
| |
| mul x15,x11,x7 |
| adds x22,x22,x16 |
| mul x16,x12,x7 |
| adcs x23,x23,x17 |
| mul x17,x13,x7 |
| adcs x24,x24,x14 |
| umulh x14,x8,x7 // hi(a[2..7]*a[1]) |
| adcs x25,x25,x15 |
| umulh x15,x9,x7 |
| adcs x26,x26,x16 |
| umulh x16,x10,x7 |
| adcs x19,x19,x17 |
| umulh x17,x11,x7 |
| stp x21,x22,[x2],#8*2 // t[2..3] |
| adc x20,xzr,xzr // t[9] |
| adds x23,x23,x14 |
| umulh x14,x12,x7 |
| adcs x24,x24,x15 |
| umulh x15,x13,x7 |
| adcs x25,x25,x16 |
| mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) |
| adcs x26,x26,x17 |
| mul x17,x10,x8 |
| adcs x19,x19,x14 |
| mul x14,x11,x8 |
| adc x20,x20,x15 |
| |
| mul x15,x12,x8 |
| adds x24,x24,x16 |
| mul x16,x13,x8 |
| adcs x25,x25,x17 |
| umulh x17,x9,x8 // hi(a[3..7]*a[2]) |
| adcs x26,x26,x14 |
| umulh x14,x10,x8 |
| adcs x19,x19,x15 |
| umulh x15,x11,x8 |
| adcs x20,x20,x16 |
| umulh x16,x12,x8 |
| stp x23,x24,[x2],#8*2 // t[4..5] |
| adc x21,xzr,xzr // t[10] |
| adds x25,x25,x17 |
| umulh x17,x13,x8 |
| adcs x26,x26,x14 |
| mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) |
| adcs x19,x19,x15 |
| mul x15,x11,x9 |
| adcs x20,x20,x16 |
| mul x16,x12,x9 |
| adc x21,x21,x17 |
| |
| mul x17,x13,x9 |
| adds x26,x26,x14 |
| umulh x14,x10,x9 // hi(a[4..7]*a[3]) |
| adcs x19,x19,x15 |
| umulh x15,x11,x9 |
| adcs x20,x20,x16 |
| umulh x16,x12,x9 |
| adcs x21,x21,x17 |
| umulh x17,x13,x9 |
| stp x25,x26,[x2],#8*2 // t[6..7] |
| adc x22,xzr,xzr // t[11] |
| adds x19,x19,x14 |
| mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) |
| adcs x20,x20,x15 |
| mul x15,x12,x10 |
| adcs x21,x21,x16 |
| mul x16,x13,x10 |
| adc x22,x22,x17 |
| |
| umulh x17,x11,x10 // hi(a[5..7]*a[4]) |
| adds x20,x20,x14 |
| umulh x14,x12,x10 |
| adcs x21,x21,x15 |
| umulh x15,x13,x10 |
| adcs x22,x22,x16 |
| mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) |
| adc x23,xzr,xzr // t[12] |
| adds x21,x21,x17 |
| mul x17,x13,x11 |
| adcs x22,x22,x14 |
| umulh x14,x12,x11 // hi(a[6..7]*a[5]) |
| adc x23,x23,x15 |
| |
| umulh x15,x13,x11 |
| adds x22,x22,x16 |
| mul x16,x13,x12 // lo(a[7]*a[6]) (vii) |
| adcs x23,x23,x17 |
| umulh x17,x13,x12 // hi(a[7]*a[6]) |
| adc x24,xzr,xzr // t[13] |
| adds x23,x23,x14 |
| sub x27,x3,x1 // done yet? |
| adc x24,x24,x15 |
| |
| adds x24,x24,x16 |
| sub x14,x3,x5 // rewinded ap |
| adc x25,xzr,xzr // t[14] |
| add x25,x25,x17 |
| |
| cbz x27,.Lsqr8x_outer_break |
| |
| mov x4,x6 |
| ldp x6,x7,[x2,#8*0] |
| ldp x8,x9,[x2,#8*2] |
| ldp x10,x11,[x2,#8*4] |
| ldp x12,x13,[x2,#8*6] |
| adds x19,x19,x6 |
| adcs x20,x20,x7 |
| ldp x6,x7,[x1,#8*0] |
| adcs x21,x21,x8 |
| adcs x22,x22,x9 |
| ldp x8,x9,[x1,#8*2] |
| adcs x23,x23,x10 |
| adcs x24,x24,x11 |
| ldp x10,x11,[x1,#8*4] |
| adcs x25,x25,x12 |
| mov x0,x1 |
| adcs x26,xzr,x13 |
| ldp x12,x13,[x1,#8*6] |
| add x1,x1,#8*8 |
| //adc x28,xzr,xzr // moved below |
| mov x27,#-8*8 |
| |
| // a[8]a[0] |
| // a[9]a[0] |
| // a[a]a[0] |
| // a[b]a[0] |
| // a[c]a[0] |
| // a[d]a[0] |
| // a[e]a[0] |
| // a[f]a[0] |
| // a[8]a[1] |
| // a[f]a[1]........................ |
| // a[8]a[2] |
| // a[f]a[2]........................ |
| // a[8]a[3] |
| // a[f]a[3]........................ |
| // a[8]a[4] |
| // a[f]a[4]........................ |
| // a[8]a[5] |
| // a[f]a[5]........................ |
| // a[8]a[6] |
| // a[f]a[6]........................ |
| // a[8]a[7] |
| // a[f]a[7]........................ |
| .Lsqr8x_mul: |
| mul x14,x6,x4 |
| adc x28,xzr,xzr // carry bit, modulo-scheduled |
| mul x15,x7,x4 |
| add x27,x27,#8 |
| mul x16,x8,x4 |
| mul x17,x9,x4 |
| adds x19,x19,x14 |
| mul x14,x10,x4 |
| adcs x20,x20,x15 |
| mul x15,x11,x4 |
| adcs x21,x21,x16 |
| mul x16,x12,x4 |
| adcs x22,x22,x17 |
| mul x17,x13,x4 |
| adcs x23,x23,x14 |
| umulh x14,x6,x4 |
| adcs x24,x24,x15 |
| umulh x15,x7,x4 |
| adcs x25,x25,x16 |
| umulh x16,x8,x4 |
| adcs x26,x26,x17 |
| umulh x17,x9,x4 |
| adc x28,x28,xzr |
| str x19,[x2],#8 |
| adds x19,x20,x14 |
| umulh x14,x10,x4 |
| adcs x20,x21,x15 |
| umulh x15,x11,x4 |
| adcs x21,x22,x16 |
| umulh x16,x12,x4 |
| adcs x22,x23,x17 |
| umulh x17,x13,x4 |
| ldr x4,[x0,x27] |
| adcs x23,x24,x14 |
| adcs x24,x25,x15 |
| adcs x25,x26,x16 |
| adcs x26,x28,x17 |
| //adc x28,xzr,xzr // moved above |
| cbnz x27,.Lsqr8x_mul |
| // note that carry flag is guaranteed |
| // to be zero at this point |
| cmp x1,x3 // done yet? |
| b.eq .Lsqr8x_break |
| |
| ldp x6,x7,[x2,#8*0] |
| ldp x8,x9,[x2,#8*2] |
| ldp x10,x11,[x2,#8*4] |
| ldp x12,x13,[x2,#8*6] |
| adds x19,x19,x6 |
| ldr x4,[x0,#-8*8] |
| adcs x20,x20,x7 |
| ldp x6,x7,[x1,#8*0] |
| adcs x21,x21,x8 |
| adcs x22,x22,x9 |
| ldp x8,x9,[x1,#8*2] |
| adcs x23,x23,x10 |
| adcs x24,x24,x11 |
| ldp x10,x11,[x1,#8*4] |
| adcs x25,x25,x12 |
| mov x27,#-8*8 |
| adcs x26,x26,x13 |
| ldp x12,x13,[x1,#8*6] |
| add x1,x1,#8*8 |
| //adc x28,xzr,xzr // moved above |
| b .Lsqr8x_mul |
| |
| .align 4 |
| .Lsqr8x_break: |
| ldp x6,x7,[x0,#8*0] |
| add x1,x0,#8*8 |
| ldp x8,x9,[x0,#8*2] |
| sub x14,x3,x1 // is it last iteration? |
| ldp x10,x11,[x0,#8*4] |
| sub x15,x2,x14 |
| ldp x12,x13,[x0,#8*6] |
| cbz x14,.Lsqr8x_outer_loop |
| |
| stp x19,x20,[x2,#8*0] |
| ldp x19,x20,[x15,#8*0] |
| stp x21,x22,[x2,#8*2] |
| ldp x21,x22,[x15,#8*2] |
| stp x23,x24,[x2,#8*4] |
| ldp x23,x24,[x15,#8*4] |
| stp x25,x26,[x2,#8*6] |
| mov x2,x15 |
| ldp x25,x26,[x15,#8*6] |
| b .Lsqr8x_outer_loop |
| |
| .align 4 |
| .Lsqr8x_outer_break: |
| // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] |
| ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] |
| ldp x15,x16,[sp,#8*1] |
| ldp x11,x13,[x14,#8*2] |
| add x1,x14,#8*4 |
| ldp x17,x14,[sp,#8*3] |
| |
| stp x19,x20,[x2,#8*0] |
| mul x19,x7,x7 |
| stp x21,x22,[x2,#8*2] |
| umulh x7,x7,x7 |
| stp x23,x24,[x2,#8*4] |
| mul x8,x9,x9 |
| stp x25,x26,[x2,#8*6] |
| mov x2,sp |
| umulh x9,x9,x9 |
| adds x20,x7,x15,lsl#1 |
| extr x15,x16,x15,#63 |
| sub x27,x5,#8*4 |
| |
| // Shift cross products left by one (extr pairs) while adding the |
| // diagonal squares a[i]^2, four limbs of a[] per pass. |
| .Lsqr4x_shift_n_add: |
| adcs x21,x8,x15 |
| extr x16,x17,x16,#63 |
| sub x27,x27,#8*4 |
| adcs x22,x9,x16 |
| ldp x15,x16,[x2,#8*5] |
| mul x10,x11,x11 |
| ldp x7,x9,[x1],#8*2 |
| umulh x11,x11,x11 |
| mul x12,x13,x13 |
| umulh x13,x13,x13 |
| extr x17,x14,x17,#63 |
| stp x19,x20,[x2,#8*0] |
| adcs x23,x10,x17 |
| extr x14,x15,x14,#63 |
| stp x21,x22,[x2,#8*2] |
| adcs x24,x11,x14 |
| ldp x17,x14,[x2,#8*7] |
| extr x15,x16,x15,#63 |
| adcs x25,x12,x15 |
| extr x16,x17,x16,#63 |
| adcs x26,x13,x16 |
| ldp x15,x16,[x2,#8*9] |
| mul x6,x7,x7 |
| ldp x11,x13,[x1],#8*2 |
| umulh x7,x7,x7 |
| mul x8,x9,x9 |
| umulh x9,x9,x9 |
| stp x23,x24,[x2,#8*4] |
| extr x17,x14,x17,#63 |
| stp x25,x26,[x2,#8*6] |
| add x2,x2,#8*8 |
| adcs x19,x6,x17 |
| extr x14,x15,x14,#63 |
| adcs x20,x7,x14 |
| ldp x17,x14,[x2,#8*3] |
| extr x15,x16,x15,#63 |
| cbnz x27,.Lsqr4x_shift_n_add |
| ldp x1,x4,[x29,#104] // pull np and n0 |
| |
| adcs x21,x8,x15 |
| extr x16,x17,x16,#63 |
| adcs x22,x9,x16 |
| ldp x15,x16,[x2,#8*5] |
| mul x10,x11,x11 |
| umulh x11,x11,x11 |
| stp x19,x20,[x2,#8*0] |
| mul x12,x13,x13 |
| umulh x13,x13,x13 |
| stp x21,x22,[x2,#8*2] |
| extr x17,x14,x17,#63 |
| adcs x23,x10,x17 |
| extr x14,x15,x14,#63 |
| ldp x19,x20,[sp,#8*0] |
| adcs x24,x11,x14 |
| extr x15,x16,x15,#63 |
| ldp x6,x7,[x1,#8*0] |
| adcs x25,x12,x15 |
| extr x16,xzr,x16,#63 |
| ldp x8,x9,[x1,#8*2] |
| adc x26,x13,x16 |
| ldp x10,x11,[x1,#8*4] |
| |
| // Reduce by 512 bits per iteration |
| mul x28,x4,x19 // t[0]*n0 |
| ldp x12,x13,[x1,#8*6] |
| add x3,x1,x5 |
| ldp x21,x22,[sp,#8*2] |
| stp x23,x24,[x2,#8*4] |
| ldp x23,x24,[sp,#8*4] |
| stp x25,x26,[x2,#8*6] |
| ldp x25,x26,[sp,#8*6] |
| add x1,x1,#8*8 |
| mov x30,xzr // initial top-most carry |
| mov x2,sp |
| mov x27,#8 |
| |
| .Lsqr8x_reduction: |
| // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) |
| mul x15,x7,x28 |
| sub x27,x27,#1 |
| mul x16,x8,x28 |
| str x28,[x2],#8 // put aside t[0]*n0 for tail processing |
| mul x17,x9,x28 |
| // (*) adds xzr,x19,x14 |
| subs xzr,x19,#1 // (*) |
| mul x14,x10,x28 |
| adcs x19,x20,x15 |
| mul x15,x11,x28 |
| adcs x20,x21,x16 |
| mul x16,x12,x28 |
| adcs x21,x22,x17 |
| mul x17,x13,x28 |
| adcs x22,x23,x14 |
| umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) |
| adcs x23,x24,x15 |
| umulh x15,x7,x28 |
| adcs x24,x25,x16 |
| umulh x16,x8,x28 |
| adcs x25,x26,x17 |
| umulh x17,x9,x28 |
| adc x26,xzr,xzr |
| adds x19,x19,x14 |
| umulh x14,x10,x28 |
| adcs x20,x20,x15 |
| umulh x15,x11,x28 |
| adcs x21,x21,x16 |
| umulh x16,x12,x28 |
| adcs x22,x22,x17 |
| umulh x17,x13,x28 |
| mul x28,x4,x19 // next t[0]*n0 |
| adcs x23,x23,x14 |
| adcs x24,x24,x15 |
| adcs x25,x25,x16 |
| adc x26,x26,x17 |
| cbnz x27,.Lsqr8x_reduction |
| |
| ldp x14,x15,[x2,#8*0] |
| ldp x16,x17,[x2,#8*2] |
| mov x0,x2 |
| sub x27,x3,x1 // done yet? |
| adds x19,x19,x14 |
| adcs x20,x20,x15 |
| ldp x14,x15,[x2,#8*4] |
| adcs x21,x21,x16 |
| adcs x22,x22,x17 |
| ldp x16,x17,[x2,#8*6] |
| adcs x23,x23,x14 |
| adcs x24,x24,x15 |
| adcs x25,x25,x16 |
| adcs x26,x26,x17 |
| //adc x28,xzr,xzr // moved below |
| cbz x27,.Lsqr8x8_post_condition |
| |
| ldr x4,[x2,#-8*8] |
| ldp x6,x7,[x1,#8*0] |
| ldp x8,x9,[x1,#8*2] |
| ldp x10,x11,[x1,#8*4] |
| mov x27,#-8*8 |
| ldp x12,x13,[x1,#8*6] |
| add x1,x1,#8*8 |
| |
| .Lsqr8x_tail: |
| mul x14,x6,x4 |
| adc x28,xzr,xzr // carry bit, modulo-scheduled |
| mul x15,x7,x4 |
| add x27,x27,#8 |
| mul x16,x8,x4 |
| mul x17,x9,x4 |
| adds x19,x19,x14 |
| mul x14,x10,x4 |
| adcs x20,x20,x15 |
| mul x15,x11,x4 |
| adcs x21,x21,x16 |
| mul x16,x12,x4 |
| adcs x22,x22,x17 |
| mul x17,x13,x4 |
| adcs x23,x23,x14 |
| umulh x14,x6,x4 |
| adcs x24,x24,x15 |
| umulh x15,x7,x4 |
| adcs x25,x25,x16 |
| umulh x16,x8,x4 |
| adcs x26,x26,x17 |
| umulh x17,x9,x4 |
| adc x28,x28,xzr |
| str x19,[x2],#8 |
| adds x19,x20,x14 |
| umulh x14,x10,x4 |
| adcs x20,x21,x15 |
| umulh x15,x11,x4 |
| adcs x21,x22,x16 |
| umulh x16,x12,x4 |
| adcs x22,x23,x17 |
| umulh x17,x13,x4 |
| ldr x4,[x0,x27] |
| adcs x23,x24,x14 |
| adcs x24,x25,x15 |
| adcs x25,x26,x16 |
| adcs x26,x28,x17 |
| //adc x28,xzr,xzr // moved above |
| cbnz x27,.Lsqr8x_tail |
| // note that carry flag is guaranteed |
| // to be zero at this point |
| ldp x6,x7,[x2,#8*0] |
| sub x27,x3,x1 // done yet? |
| sub x16,x3,x5 // rewinded np |
| ldp x8,x9,[x2,#8*2] |
| ldp x10,x11,[x2,#8*4] |
| ldp x12,x13,[x2,#8*6] |
| cbz x27,.Lsqr8x_tail_break |
| |
| ldr x4,[x0,#-8*8] |
| adds x19,x19,x6 |
| adcs x20,x20,x7 |
| ldp x6,x7,[x1,#8*0] |
| adcs x21,x21,x8 |
| adcs x22,x22,x9 |
| ldp x8,x9,[x1,#8*2] |
| adcs x23,x23,x10 |
| adcs x24,x24,x11 |
| ldp x10,x11,[x1,#8*4] |
| adcs x25,x25,x12 |
| mov x27,#-8*8 |
| adcs x26,x26,x13 |
| ldp x12,x13,[x1,#8*6] |
| add x1,x1,#8*8 |
| //adc x28,xzr,xzr // moved above |
| b .Lsqr8x_tail |
| |
| .align 4 |
| .Lsqr8x_tail_break: |
| ldr x4,[x29,#112] // pull n0 |
| add x27,x2,#8*8 // end of current t[num] window |
| |
| subs xzr,x30,#1 // "move" top-most carry to carry bit |
| adcs x14,x19,x6 |
| adcs x15,x20,x7 |
| ldp x19,x20,[x0,#8*0] |
| adcs x21,x21,x8 |
| ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] |
| adcs x22,x22,x9 |
| ldp x8,x9,[x16,#8*2] |
| adcs x23,x23,x10 |
| adcs x24,x24,x11 |
| ldp x10,x11,[x16,#8*4] |
| adcs x25,x25,x12 |
| adcs x26,x26,x13 |
| ldp x12,x13,[x16,#8*6] |
| add x1,x16,#8*8 |
| adc x30,xzr,xzr // top-most carry |
| mul x28,x4,x19 |
| stp x14,x15,[x2,#8*0] |
| stp x21,x22,[x2,#8*2] |
| ldp x21,x22,[x0,#8*2] |
| stp x23,x24,[x2,#8*4] |
| ldp x23,x24,[x0,#8*4] |
| cmp x27,x29 // did we hit the bottom? |
| stp x25,x26,[x2,#8*6] |
| mov x2,x0 // slide the window |
| ldp x25,x26,[x0,#8*6] |
| mov x27,#8 |
| b.ne .Lsqr8x_reduction |
| |
| // Final step. We see if result is larger than modulus, and |
| // if it is, subtract the modulus. But comparison implies |
| // subtraction. So we subtract modulus, see if it borrowed, |
| // and conditionally copy original value. |
| ldr x0,[x29,#96] // pull rp |
| add x2,x2,#8*8 |
| subs x14,x19,x6 |
| sbcs x15,x20,x7 |
| sub x27,x5,#8*8 |
| mov x3,x0 // x0 copy |
| |
| .Lsqr8x_sub: |
| sbcs x16,x21,x8 |
| ldp x6,x7,[x1,#8*0] |
| sbcs x17,x22,x9 |
| stp x14,x15,[x0,#8*0] |
| sbcs x14,x23,x10 |
| ldp x8,x9,[x1,#8*2] |
| sbcs x15,x24,x11 |
| stp x16,x17,[x0,#8*2] |
| sbcs x16,x25,x12 |
| ldp x10,x11,[x1,#8*4] |
| sbcs x17,x26,x13 |
| ldp x12,x13,[x1,#8*6] |
| add x1,x1,#8*8 |
| ldp x19,x20,[x2,#8*0] |
| sub x27,x27,#8*8 |
| ldp x21,x22,[x2,#8*2] |
| ldp x23,x24,[x2,#8*4] |
| ldp x25,x26,[x2,#8*6] |
| add x2,x2,#8*8 |
| stp x14,x15,[x0,#8*4] |
| sbcs x14,x19,x6 |
| stp x16,x17,[x0,#8*6] |
| add x0,x0,#8*8 |
| sbcs x15,x20,x7 |
| cbnz x27,.Lsqr8x_sub |
| |
| sbcs x16,x21,x8 |
| mov x2,sp |
| add x1,sp,x5 |
| ldp x6,x7,[x3,#8*0] |
| sbcs x17,x22,x9 |
| stp x14,x15,[x0,#8*0] |
| sbcs x14,x23,x10 |
| ldp x8,x9,[x3,#8*2] |
| sbcs x15,x24,x11 |
| stp x16,x17,[x0,#8*2] |
| sbcs x16,x25,x12 |
| ldp x19,x20,[x1,#8*0] |
| sbcs x17,x26,x13 |
| ldp x21,x22,[x1,#8*2] |
| sbcs xzr,x30,xzr // did it borrow? |
| ldr x30,[x29,#8] // pull return address |
| stp x14,x15,[x0,#8*4] |
| stp x16,x17,[x0,#8*6] |
| |
| // Conditionally copy t[] back over rp[] (if the subtraction borrowed) |
| // while wiping the stack temporary, four limbs per pass. |
| sub x27,x5,#8*4 |
| .Lsqr4x_cond_copy: |
| sub x27,x27,#8*4 |
| csel x14,x19,x6,lo |
| stp xzr,xzr,[x2,#8*0] |
| csel x15,x20,x7,lo |
| ldp x6,x7,[x3,#8*4] |
| ldp x19,x20,[x1,#8*4] |
| csel x16,x21,x8,lo |
| stp xzr,xzr,[x2,#8*2] |
| add x2,x2,#8*4 |
| csel x17,x22,x9,lo |
| ldp x8,x9,[x3,#8*6] |
| ldp x21,x22,[x1,#8*6] |
| add x1,x1,#8*4 |
| stp x14,x15,[x3,#8*0] |
| stp x16,x17,[x3,#8*2] |
| add x3,x3,#8*4 |
| stp xzr,xzr,[x1,#8*0] |
| stp xzr,xzr,[x1,#8*2] |
| cbnz x27,.Lsqr4x_cond_copy |
| |
| csel x14,x19,x6,lo |
| stp xzr,xzr,[x2,#8*0] |
| csel x15,x20,x7,lo |
| stp xzr,xzr,[x2,#8*2] |
| csel x16,x21,x8,lo |
| csel x17,x22,x9,lo |
| stp x14,x15,[x3,#8*0] |
| stp x16,x17,[x3,#8*2] |
| |
| b .Lsqr8x_done |
| |
| .align 4 |
| .Lsqr8x8_post_condition: |
| adc x28,xzr,xzr |
| ldr x30,[x29,#8] // pull return address |
| // x19-7,x28 hold result, x6-7 hold modulus |
| subs x6,x19,x6 |
| ldr x1,[x29,#96] // pull rp |
| sbcs x7,x20,x7 |
| stp xzr,xzr,[sp,#8*0] |
| sbcs x8,x21,x8 |
| stp xzr,xzr,[sp,#8*2] |
| sbcs x9,x22,x9 |
| stp xzr,xzr,[sp,#8*4] |
| sbcs x10,x23,x10 |
| stp xzr,xzr,[sp,#8*6] |
| sbcs x11,x24,x11 |
| stp xzr,xzr,[sp,#8*8] |
| sbcs x12,x25,x12 |
| stp xzr,xzr,[sp,#8*10] |
| sbcs x13,x26,x13 |
| stp xzr,xzr,[sp,#8*12] |
| sbcs x28,x28,xzr // did it borrow? |
| stp xzr,xzr,[sp,#8*14] |
| |
| // x6-7 hold result-modulus |
| csel x6,x19,x6,lo |
| csel x7,x20,x7,lo |
| csel x8,x21,x8,lo |
| csel x9,x22,x9,lo |
| stp x6,x7,[x1,#8*0] |
| csel x10,x23,x10,lo |
| csel x11,x24,x11,lo |
| stp x8,x9,[x1,#8*2] |
| csel x12,x25,x12,lo |
| csel x13,x26,x13,lo |
| stp x10,x11,[x1,#8*4] |
| stp x12,x13,[x1,#8*6] |
| |
| .Lsqr8x_done: |
| ldp x19,x20,[x29,#16] |
| mov sp,x29 |
| ldp x21,x22,[x29,#32] |
| mov x0,#1 |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldr x29,[sp],#128 |
| ret |
| .size __bn_sqr8x_mont,.-__bn_sqr8x_mont |
| // Multiplication path for num divisible by 4 (also the fallback for the |
| // 8x squaring path when ap!=bp).  Arguments as bn_mul_mont: x0=rp, |
| // x1=ap, x2=bp, x3=np, x4=&n0, x5=num.  Works on a 4-limb accumulator |
| // window (x19-x22); x28 cycles through offsets 0/8/16/24 (and #31 mask) |
| // to step b[] and the stashed t[0]*n0 values four words at a time. |
| .type __bn_mul4x_mont,%function |
| .align 5 |
| __bn_mul4x_mont: |
| stp x29,x30,[sp,#-128]! |
| add x29,sp,#0 |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| stp x25,x26,[sp,#64] |
| stp x27,x28,[sp,#80] |
| |
| sub x26,sp,x5,lsl#3 |
| lsl x5,x5,#3 |
| ldr x4,[x4] // *n0 |
| sub sp,x26,#8*4 // alloca |
| |
| add x10,x2,x5 |
| add x27,x1,x5 |
| stp x0,x10,[x29,#96] // offload rp and &b[num] |
| |
| ldr x24,[x2,#8*0] // b[0] |
| ldp x6,x7,[x1,#8*0] // a[0..3] |
| ldp x8,x9,[x1,#8*2] |
| add x1,x1,#8*4 |
| mov x19,xzr |
| mov x20,xzr |
| mov x21,xzr |
| mov x22,xzr |
| ldp x14,x15,[x3,#8*0] // n[0..3] |
| ldp x16,x17,[x3,#8*2] |
| adds x3,x3,#8*4 // clear carry bit |
| mov x0,xzr |
| mov x28,#0 |
| mov x26,sp |
| |
| // First reduction pass: multiply a[0..3] by b[0..3] and interleave the |
| // Montgomery step with n[0..3], one b[] word per trip (x28 cycles 8,16,24,0). |
| .Loop_mul4x_1st_reduction: |
| mul x10,x6,x24 // lo(a[0..3]*b[0]) |
| adc x0,x0,xzr // modulo-scheduled |
| mul x11,x7,x24 |
| add x28,x28,#8 |
| mul x12,x8,x24 |
| and x28,x28,#31 |
| mul x13,x9,x24 |
| adds x19,x19,x10 |
| umulh x10,x6,x24 // hi(a[0..3]*b[0]) |
| adcs x20,x20,x11 |
| mul x25,x19,x4 // t[0]*n0 |
| adcs x21,x21,x12 |
| umulh x11,x7,x24 |
| adcs x22,x22,x13 |
| umulh x12,x8,x24 |
| adc x23,xzr,xzr |
| umulh x13,x9,x24 |
| ldr x24,[x2,x28] // next b[i] (or b[0]) |
| adds x20,x20,x10 |
| // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) |
| str x25,[x26],#8 // put aside t[0]*n0 for tail processing |
| adcs x21,x21,x11 |
| mul x11,x15,x25 |
| adcs x22,x22,x12 |
| mul x12,x16,x25 |
| adc x23,x23,x13 // can't overflow |
| mul x13,x17,x25 |
| // (*) adds xzr,x19,x10 |
| subs xzr,x19,#1 // (*) |
| umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) |
| adcs x19,x20,x11 |
| umulh x11,x15,x25 |
| adcs x20,x21,x12 |
| umulh x12,x16,x25 |
| adcs x21,x22,x13 |
| umulh x13,x17,x25 |
| adcs x22,x23,x0 |
| adc x0,xzr,xzr |
| adds x19,x19,x10 |
| sub x10,x27,x1 |
| adcs x20,x20,x11 |
| adcs x21,x21,x12 |
| adcs x22,x22,x13 |
| //adc x0,x0,xzr |
| cbnz x28,.Loop_mul4x_1st_reduction |
| |
| cbz x10,.Lmul4x4_post_condition |
| |
| ldp x6,x7,[x1,#8*0] // a[4..7] |
| ldp x8,x9,[x1,#8*2] |
| add x1,x1,#8*4 |
| ldr x25,[sp] // a[0]*n0 |
| ldp x14,x15,[x3,#8*0] // n[4..7] |
| ldp x16,x17,[x3,#8*2] |
| add x3,x3,#8*4 |
| |
| .Loop_mul4x_1st_tail: |
| mul x10,x6,x24 // lo(a[4..7]*b[i]) |
| adc x0,x0,xzr // modulo-scheduled |
| mul x11,x7,x24 |
| add x28,x28,#8 |
| mul x12,x8,x24 |
| and x28,x28,#31 |
| mul x13,x9,x24 |
| adds x19,x19,x10 |
| umulh x10,x6,x24 // hi(a[4..7]*b[i]) |
| adcs x20,x20,x11 |
| umulh x11,x7,x24 |
| adcs x21,x21,x12 |
| umulh x12,x8,x24 |
| adcs x22,x22,x13 |
| umulh x13,x9,x24 |
| adc x23,xzr,xzr |
| ldr x24,[x2,x28] // next b[i] (or b[0]) |
| adds x20,x20,x10 |
| mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) |
| adcs x21,x21,x11 |
| mul x11,x15,x25 |
| adcs x22,x22,x12 |
| mul x12,x16,x25 |
| adc x23,x23,x13 // can't overflow |
| mul x13,x17,x25 |
| adds x19,x19,x10 |
| umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) |
| adcs x20,x20,x11 |
| umulh x11,x15,x25 |
| adcs x21,x21,x12 |
| umulh x12,x16,x25 |
| adcs x22,x22,x13 |
| adcs x23,x23,x0 |
| umulh x13,x17,x25 |
| adc x0,xzr,xzr |
| ldr x25,[sp,x28] // next t[0]*n0 |
| str x19,[x26],#8 // result!!! |
| adds x19,x20,x10 |
| sub x10,x27,x1 // done yet? |
| adcs x20,x21,x11 |
| adcs x21,x22,x12 |
| adcs x22,x23,x13 |
| //adc x0,x0,xzr |
| cbnz x28,.Loop_mul4x_1st_tail |
| |
| sub x11,x27,x5 // rewinded x1 |
| cbz x10,.Lmul4x_proceed |
| |
| ldp x6,x7,[x1,#8*0] |
| ldp x8,x9,[x1,#8*2] |
| add x1,x1,#8*4 |
| ldp x14,x15,[x3,#8*0] |
| ldp x16,x17,[x3,#8*2] |
| add x3,x3,#8*4 |
| b .Loop_mul4x_1st_tail |
| |
| .align 5 |
| .Lmul4x_proceed: |
| ldr x24,[x2,#8*4]! // *++b |
| adc x30,x0,xzr |
| ldp x6,x7,[x11,#8*0] // a[0..3] |
| sub x3,x3,x5 // rewind np |
| ldp x8,x9,[x11,#8*2] |
| add x1,x11,#8*4 |
| |
| stp x19,x20,[x26,#8*0] // result!!! |
| ldp x19,x20,[sp,#8*4] // t[0..3] |
| stp x21,x22,[x26,#8*2] // result!!! |
| ldp x21,x22,[sp,#8*6] |
| |
| ldp x14,x15,[x3,#8*0] // n[0..3] |
| mov x26,sp |
| ldp x16,x17,[x3,#8*2] |
| adds x3,x3,#8*4 // clear carry bit |
| mov x0,xzr |
| |
| .align 4 |
| .Loop_mul4x_reduction: |
| mul x10,x6,x24 // lo(a[0..3]*b[4]) |
| adc x0,x0,xzr // modulo-scheduled |
| mul x11,x7,x24 |
| add x28,x28,#8 |
| mul x12,x8,x24 |
| and x28,x28,#31 |
| mul x13,x9,x24 |
| adds x19,x19,x10 |
| umulh x10,x6,x24 // hi(a[0..3]*b[4]) |
| adcs x20,x20,x11 |
| mul x25,x19,x4 // t[0]*n0 |
| adcs x21,x21,x12 |
| umulh x11,x7,x24 |
| adcs x22,x22,x13 |
| umulh x12,x8,x24 |
| adc x23,xzr,xzr |
| umulh x13,x9,x24 |
| ldr x24,[x2,x28] // next b[i] |
| adds x20,x20,x10 |
| // (*) mul x10,x14,x25 |
| str x25,[x26],#8 // put aside t[0]*n0 for tail processing |
| adcs x21,x21,x11 |
| mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 |
| adcs x22,x22,x12 |
| mul x12,x16,x25 |
| adc x23,x23,x13 // can't overflow |
| mul x13,x17,x25 |
| // (*) adds xzr,x19,x10 |
| subs xzr,x19,#1 // (*) |
| umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 |
| adcs x19,x20,x11 |
| umulh x11,x15,x25 |
| adcs x20,x21,x12 |
| umulh x12,x16,x25 |
| adcs x21,x22,x13 |
| umulh x13,x17,x25 |
| adcs x22,x23,x0 |
| adc x0,xzr,xzr |
| adds x19,x19,x10 |
| adcs x20,x20,x11 |
| adcs x21,x21,x12 |
| adcs x22,x22,x13 |
| //adc x0,x0,xzr |
| cbnz x28,.Loop_mul4x_reduction |
| |
| adc x0,x0,xzr |
| ldp x10,x11,[x26,#8*4] // t[4..7] |
| ldp x12,x13,[x26,#8*6] |
| ldp x6,x7,[x1,#8*0] // a[4..7] |
| ldp x8,x9,[x1,#8*2] |
| add x1,x1,#8*4 |
| adds x19,x19,x10 |
| adcs x20,x20,x11 |
| adcs x21,x21,x12 |
| adcs x22,x22,x13 |
| //adc x0,x0,xzr |
| |
| ldr x25,[sp] // t[0]*n0 |
| ldp x14,x15,[x3,#8*0] // n[4..7] |
| ldp x16,x17,[x3,#8*2] |
| add x3,x3,#8*4 |
| |
| .align 4 |
| .Loop_mul4x_tail: |
| mul x10,x6,x24 // lo(a[4..7]*b[4]) |
| adc x0,x0,xzr // modulo-scheduled |
| mul x11,x7,x24 |
| add x28,x28,#8 |
| mul x12,x8,x24 |
| and x28,x28,#31 |
| mul x13,x9,x24 |
| adds x19,x19,x10 |
| umulh x10,x6,x24 // hi(a[4..7]*b[4]) |
| adcs x20,x20,x11 |
| umulh x11,x7,x24 |
| adcs x21,x21,x12 |
| umulh x12,x8,x24 |
| adcs x22,x22,x13 |
| umulh x13,x9,x24 |
| adc x23,xzr,xzr |
| ldr x24,[x2,x28] // next b[i] |
| adds x20,x20,x10 |
| mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) |
| adcs x21,x21,x11 |
| mul x11,x15,x25 |
| adcs x22,x22,x12 |
| mul x12,x16,x25 |
| adc x23,x23,x13 // can't overflow |
| mul x13,x17,x25 |
| adds x19,x19,x10 |
| umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) |
| adcs x20,x20,x11 |
| umulh x11,x15,x25 |
| adcs x21,x21,x12 |
| umulh x12,x16,x25 |
| adcs x22,x22,x13 |
| umulh x13,x17,x25 |
| adcs x23,x23,x0 |
| ldr x25,[sp,x28] // next a[0]*n0 |
| adc x0,xzr,xzr |
| str x19,[x26],#8 // result!!! |
| adds x19,x20,x10 |
| sub x10,x27,x1 // done yet? |
| adcs x20,x21,x11 |
| adcs x21,x22,x12 |
| adcs x22,x23,x13 |
| //adc x0,x0,xzr |
| cbnz x28,.Loop_mul4x_tail |
| |
| sub x11,x3,x5 // rewinded np? |
| adc x0,x0,xzr |
| cbz x10,.Loop_mul4x_break |
| |
| ldp x10,x11,[x26,#8*4] |
| ldp x12,x13,[x26,#8*6] |
| ldp x6,x7,[x1,#8*0] |
| ldp x8,x9,[x1,#8*2] |
| add x1,x1,#8*4 |
| adds x19,x19,x10 |
| adcs x20,x20,x11 |
| adcs x21,x21,x12 |
| adcs x22,x22,x13 |
| //adc x0,x0,xzr |
| ldp x14,x15,[x3,#8*0] |
| ldp x16,x17,[x3,#8*2] |
| add x3,x3,#8*4 |
| b .Loop_mul4x_tail |
| |
| .align 4 |
| .Loop_mul4x_break: |
| ldp x12,x13,[x29,#96] // pull rp and &b[num] |
| adds x19,x19,x30 |
| add x2,x2,#8*4 // bp++ |
| adcs x20,x20,xzr |
| sub x1,x1,x5 // rewind ap |
| adcs x21,x21,xzr |
| stp x19,x20,[x26,#8*0] // result!!! |
| adcs x22,x22,xzr |
| ldp x19,x20,[sp,#8*4] // t[0..3] |
| adc x30,x0,xzr |
| stp x21,x22,[x26,#8*2] // result!!! |
| cmp x2,x13 // done yet? |
| ldp x21,x22,[sp,#8*6] |
| ldp x14,x15,[x11,#8*0] // n[0..3] |
| ldp x16,x17,[x11,#8*2] |
| add x3,x11,#8*4 |
| b.eq .Lmul4x_post |
| |
| ldr x24,[x2] |
| ldp x6,x7,[x1,#8*0] // a[0..3] |
| ldp x8,x9,[x1,#8*2] |
| adds x1,x1,#8*4 // clear carry bit |
| mov x0,xzr |
| mov x26,sp |
| b .Loop_mul4x_reduction |
| |
| .align 4 |
| .Lmul4x_post: |
| // Final step. We see if result is larger than modulus, and |
| // if it is, subtract the modulus. But comparison implies |
| // subtraction. So we subtract modulus, see if it borrowed, |
| // and conditionally copy original value. |
| mov x0,x12 |
| mov x27,x12 // x0 copy |
| subs x10,x19,x14 |
| add x26,sp,#8*8 |
| sbcs x11,x20,x15 |
| sub x28,x5,#8*4 |
| |
| .Lmul4x_sub: |
| sbcs x12,x21,x16 |
| ldp x14,x15,[x3,#8*0] |
| sub x28,x28,#8*4 |
| ldp x19,x20,[x26,#8*0] |
| sbcs x13,x22,x17 |
| ldp x16,x17,[x3,#8*2] |
| add x3,x3,#8*4 |
| ldp x21,x22,[x26,#8*2] |
| add x26,x26,#8*4 |
| stp x10,x11,[x0,#8*0] |
| sbcs x10,x19,x14 |
| stp x12,x13,[x0,#8*2] |
| add x0,x0,#8*4 |
| sbcs x11,x20,x15 |
| cbnz x28,.Lmul4x_sub |
| |
| sbcs x12,x21,x16 |
| mov x26,sp |
| add x1,sp,#8*4 |
| ldp x6,x7,[x27,#8*0] |
| sbcs x13,x22,x17 |
| stp x10,x11,[x0,#8*0] |
| ldp x8,x9,[x27,#8*2] |
| stp x12,x13,[x0,#8*2] |
| ldp x19,x20,[x1,#8*0] |
| ldp x21,x22,[x1,#8*2] |
| sbcs xzr,x30,xzr // did it borrow? |
| ldr x30,[x29,#8] // pull return address |
| |
| // Conditionally copy t[] back over rp[] (if the subtraction borrowed) |
| // while wiping the stack temporary, four limbs per pass. |
| sub x28,x5,#8*4 |
| .Lmul4x_cond_copy: |
| sub x28,x28,#8*4 |
| csel x10,x19,x6,lo |
| stp xzr,xzr,[x26,#8*0] |
| csel x11,x20,x7,lo |
| ldp x6,x7,[x27,#8*4] |
| ldp x19,x20,[x1,#8*4] |
| csel x12,x21,x8,lo |
| stp xzr,xzr,[x26,#8*2] |
| add x26,x26,#8*4 |
| csel x13,x22,x9,lo |
| ldp x8,x9,[x27,#8*6] |
| ldp x21,x22,[x1,#8*6] |
| add x1,x1,#8*4 |
| stp x10,x11,[x27,#8*0] |
| stp x12,x13,[x27,#8*2] |
| add x27,x27,#8*4 |
| cbnz x28,.Lmul4x_cond_copy |
| |
| csel x10,x19,x6,lo |
| stp xzr,xzr,[x26,#8*0] |
| csel x11,x20,x7,lo |
| stp xzr,xzr,[x26,#8*2] |
| csel x12,x21,x8,lo |
| stp xzr,xzr,[x26,#8*3] |
| csel x13,x22,x9,lo |
| stp xzr,xzr,[x26,#8*4] |
| stp x10,x11,[x27,#8*0] |
| stp x12,x13,[x27,#8*2] |
| |
| b .Lmul4x_done |
| |
| .align 4 |
| .Lmul4x4_post_condition: |
| adc x0,x0,xzr |
| ldr x1,[x29,#96] // pull rp |
| // x19-3,x0 hold result, x14-7 hold modulus |
| subs x6,x19,x14 |
| ldr x30,[x29,#8] // pull return address |
| sbcs x7,x20,x15 |
| stp xzr,xzr,[sp,#8*0] |
| sbcs x8,x21,x16 |
| stp xzr,xzr,[sp,#8*2] |
| sbcs x9,x22,x17 |
| stp xzr,xzr,[sp,#8*4] |
| sbcs xzr,x0,xzr // did it borrow? |
| stp xzr,xzr,[sp,#8*6] |
| |
| // x6-3 hold result-modulus |
| csel x6,x19,x6,lo |
| csel x7,x20,x7,lo |
| csel x8,x21,x8,lo |
| csel x9,x22,x9,lo |
| stp x6,x7,[x1,#8*0] |
| stp x8,x9,[x1,#8*2] |
| |
| .Lmul4x_done: |
| ldp x19,x20,[x29,#16] |
| mov sp,x29 |
| ldp x21,x22,[x29,#32] |
| mov x0,#1 |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldr x29,[sp],#128 |
| ret |
| .size __bn_mul4x_mont,.-__bn_mul4x_mont |
| .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
| .align 2 |
| .align 4 |
| #endif |