| #include <openssl/arm_arch.h> |
| |
| .text |
| |
| |
| |
// Read-only constants shared by the ChaCha20 routines in this file.
| .align 5 |
// Lsigma: the 16-byte ChaCha constant. Read as little-endian 32-bit words it
// is 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, i.e. the ASCII text
// "expand 32-byte k". It is stored as two 64-bit values and loaded with
// 64-bit loads (ldp) before being split into 32-bit halves.
| Lsigma: |
| .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral |
// Lone: the vector {1,0,0,0}. It sits directly after Lsigma, so the NEON
// code reaches it with a post-incremented load from Lsigma and uses it to
// step the first 32-bit lane of the counter row.
| Lone: |
| .long 1,0,0,0 |
// LOPENSSL_armcap_P: distance from this location to _OPENSSL_armcap_P, so
// the capability word can be located PC-relatively (adr + load + indexed
// load). The entry is 32 bits wide under __ILP32__ and 64 bits otherwise.
| LOPENSSL_armcap_P: |
| #ifdef __ILP32__ |
| .long _OPENSSL_armcap_P-. |
| #else |
| .quad _OPENSSL_armcap_P-. |
| #endif |
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
| .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 |
| .align 2 |
| |
// _ChaCha20_ctr32 -- XOR a buffer with the ChaCha20 keystream.
//
// Registers as the code below reads them on entry:
//   x0 = output pointer        x1 = input pointer
//   x2 = length in bytes       x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block
// NOTE(review): presumably matches a C prototype of the form
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// -- confirm against the declaring header.
//
// Behaviour:
//   * len == 0 returns immediately, before any stack frame is created.
//   * len < 192 always uses the scalar code in this block.
//   * len >= 192 tail-branches to ChaCha20_neon when the ARMV7_NEON bit is
//     set in the capability word, with x0-x4 still holding the arguments.
//
// Scalar register map while running:
//   x22,x23 = sigma    x24-x27 = key    x28,x30 = counter block (two 32-bit
//   words per register); w5-w17,w19-w21 = the sixteen working state words
//   (x18 is not used); x4 = round counter.
// x30 (the link register) is used as a data register; its original value is
// reloaded from the frame in the epilogue before ret.
| .globl _ChaCha20_ctr32 |
| .private_extern _ChaCha20_ctr32 |
| |
| .align 5 |
| _ChaCha20_ctr32: |
| cbz x2,Labort |
| adr x5,LOPENSSL_armcap_P |
| cmp x2,#192 |
| b.lo Lshort |
// x6 = stored offset to _OPENSSL_armcap_P; [x6,x5] is the capability word.
| #ifdef __ILP32__ |
| ldrsw x6,[x5] |
| #else |
| ldr x6,[x5] |
| #endif |
| ldr w17,[x6,x5] |
| tst w17,#ARMV7_NEON |
| b.ne ChaCha20_neon |
| |
// Scalar path. Frame: 96 bytes holding x29/x30 and x19-x28, addressed via
// x29, plus 64 bytes of scratch below it (at sp) that the tail code uses to
// stage one block of keystream.
| Lshort: |
| stp x29,x30,[sp,#-96]! |
| add x29,sp,#0 |
| |
| adr x5,Lsigma |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| stp x25,x26,[sp,#64] |
| stp x27,x28,[sp,#80] |
| sub sp,sp,#64 |
| |
| ldp x22,x23,[x5] // load sigma |
| ldp x24,x25,[x3] // load key |
| ldp x26,x27,[x3,#16] |
| ldp x28,x30,[x4] // load counter |
// On big-endian builds the two 32-bit halves of each loaded register are
// swapped so that the first word in memory ends up in the low half.
| #ifdef __ARMEB__ |
| ror x24,x24,#32 |
| ror x25,x25,#32 |
| ror x26,x26,#32 |
| ror x27,x27,#32 |
| ror x28,x28,#32 |
| ror x30,x30,#32 |
| #endif |
| |
// One iteration per 64-byte block. Each 64-bit input register is split into
// its low word (mov w) and high word (lsr #32) to form the working state.
| Loop_outer: |
| mov w5,w22 // unpack key block |
| lsr x6,x22,#32 |
| mov w7,w23 |
| lsr x8,x23,#32 |
| mov w9,w24 |
| lsr x10,x24,#32 |
| mov w11,w25 |
| lsr x12,x25,#32 |
| mov w13,w26 |
| lsr x14,x26,#32 |
| mov w15,w27 |
| lsr x16,x27,#32 |
| mov w17,w28 |
| lsr x19,x28,#32 |
| mov w20,w30 |
| lsr x21,x30,#32 |
| |
// x4 = 10 passes of the loop body; each pass is one column round followed by
// one diagonal round. The flags set by this subs are consumed much later by
// "b.lo Ltail" and "b.hi Loop_outer"; nothing in between writes the flags.
| mov x4,#10 |
| subs x2,x2,#64 |
| Loop: |
| sub x4,x4,#1 |
// Column round: four quarter-rounds in parallel on (w5,w9,w13,w17),
// (w6,w10,w14,w19), (w7,w11,w15,w20) and (w8,w12,w16,w21).
// ror by 16/20/24/25 is a left rotate by 16/12/8/7.
| add w5,w5,w9 |
| add w6,w6,w10 |
| add w7,w7,w11 |
| add w8,w8,w12 |
| eor w17,w17,w5 |
| eor w19,w19,w6 |
| eor w20,w20,w7 |
| eor w21,w21,w8 |
| ror w17,w17,#16 |
| ror w19,w19,#16 |
| ror w20,w20,#16 |
| ror w21,w21,#16 |
| add w13,w13,w17 |
| add w14,w14,w19 |
| add w15,w15,w20 |
| add w16,w16,w21 |
| eor w9,w9,w13 |
| eor w10,w10,w14 |
| eor w11,w11,w15 |
| eor w12,w12,w16 |
| ror w9,w9,#20 |
| ror w10,w10,#20 |
| ror w11,w11,#20 |
| ror w12,w12,#20 |
| add w5,w5,w9 |
| add w6,w6,w10 |
| add w7,w7,w11 |
| add w8,w8,w12 |
| eor w17,w17,w5 |
| eor w19,w19,w6 |
| eor w20,w20,w7 |
| eor w21,w21,w8 |
| ror w17,w17,#24 |
| ror w19,w19,#24 |
| ror w20,w20,#24 |
| ror w21,w21,#24 |
| add w13,w13,w17 |
| add w14,w14,w19 |
| add w15,w15,w20 |
| add w16,w16,w21 |
| eor w9,w9,w13 |
| eor w10,w10,w14 |
| eor w11,w11,w15 |
| eor w12,w12,w16 |
| ror w9,w9,#25 |
| ror w10,w10,#25 |
| ror w11,w11,#25 |
| ror w12,w12,#25 |
// Diagonal round: the same quarter-round on (w5,w10,w15,w21),
// (w6,w11,w16,w17), (w7,w12,w13,w19) and (w8,w9,w14,w20).
| add w5,w5,w10 |
| add w6,w6,w11 |
| add w7,w7,w12 |
| add w8,w8,w9 |
| eor w21,w21,w5 |
| eor w17,w17,w6 |
| eor w19,w19,w7 |
| eor w20,w20,w8 |
| ror w21,w21,#16 |
| ror w17,w17,#16 |
| ror w19,w19,#16 |
| ror w20,w20,#16 |
| add w15,w15,w21 |
| add w16,w16,w17 |
| add w13,w13,w19 |
| add w14,w14,w20 |
| eor w10,w10,w15 |
| eor w11,w11,w16 |
| eor w12,w12,w13 |
| eor w9,w9,w14 |
| ror w10,w10,#20 |
| ror w11,w11,#20 |
| ror w12,w12,#20 |
| ror w9,w9,#20 |
| add w5,w5,w10 |
| add w6,w6,w11 |
| add w7,w7,w12 |
| add w8,w8,w9 |
| eor w21,w21,w5 |
| eor w17,w17,w6 |
| eor w19,w19,w7 |
| eor w20,w20,w8 |
| ror w21,w21,#24 |
| ror w17,w17,#24 |
| ror w19,w19,#24 |
| ror w20,w20,#24 |
| add w15,w15,w21 |
| add w16,w16,w17 |
| add w13,w13,w19 |
| add w14,w14,w20 |
| eor w10,w10,w15 |
| eor w11,w11,w16 |
| eor w12,w12,w13 |
| eor w9,w9,w14 |
| ror w10,w10,#25 |
| ror w11,w11,#25 |
| ror w12,w12,#25 |
| ror w9,w9,#25 |
| cbnz x4,Loop |
| |
// Add the original input state back in. The high-word additions are 64-bit,
// so a carry may land in bit 32; it is shifted out by the lsl #32 used when
// the two halves are packed.
| add w5,w5,w22 // accumulate key block |
| add x6,x6,x22,lsr#32 |
| add w7,w7,w23 |
| add x8,x8,x23,lsr#32 |
| add w9,w9,w24 |
| add x10,x10,x24,lsr#32 |
| add w11,w11,w25 |
| add x12,x12,x25,lsr#32 |
| add w13,w13,w26 |
| add x14,x14,x26,lsr#32 |
| add w15,w15,w27 |
| add x16,x16,x27,lsr#32 |
| add w17,w17,w28 |
| add x19,x19,x28,lsr#32 |
| add w20,w20,w30 |
| add x21,x21,x30,lsr#32 |
| |
// Fewer than 64 bytes were left (the subs above borrowed): finish in Ltail.
| b.lo Ltail |
| |
// Full block: pack word pairs into 64-bit registers, interleaved with
// loading 64 bytes of input into the registers just freed.
| add x5,x5,x6,lsl#32 // pack |
| add x7,x7,x8,lsl#32 |
| ldp x6,x8,[x1,#0] // load input |
| add x9,x9,x10,lsl#32 |
| add x11,x11,x12,lsl#32 |
| ldp x10,x12,[x1,#16] |
| add x13,x13,x14,lsl#32 |
| add x15,x15,x16,lsl#32 |
| ldp x14,x16,[x1,#32] |
| add x17,x17,x19,lsl#32 |
| add x20,x20,x21,lsl#32 |
| ldp x19,x21,[x1,#48] |
| add x1,x1,#64 |
| #ifdef __ARMEB__ |
| rev x5,x5 |
| rev x7,x7 |
| rev x9,x9 |
| rev x11,x11 |
| rev x13,x13 |
| rev x15,x15 |
| rev x17,x17 |
| rev x20,x20 |
| #endif |
| eor x5,x5,x6 |
| eor x7,x7,x8 |
| eor x9,x9,x10 |
| eor x11,x11,x12 |
| eor x13,x13,x14 |
| eor x15,x15,x16 |
| eor x17,x17,x19 |
| eor x20,x20,x21 |
| |
// NOTE(review): the counter increment below is a 64-bit add on the register
// holding counter words 0 and 1, so a wrap of the low 32-bit word would carry
// into the next word. Presumably callers never let the 32-bit counter wrap
// within one call -- confirm.
| stp x5,x7,[x0,#0] // store output |
| add x28,x28,#1 // increment counter |
| stp x9,x11,[x0,#16] |
| stp x13,x15,[x0,#32] |
| stp x17,x20,[x0,#48] |
| add x0,x0,#64 |
| |
// Still using the flags from the subs: loop while bytes remain.
| b.hi Loop_outer |
| |
// Epilogue: callee-saved registers are reloaded relative to x29 because sp
// still includes the 64-byte scratch area until the add below.
| ldp x19,x20,[x29,#16] |
| add sp,sp,#64 |
| ldp x21,x22,[x29,#32] |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldp x29,x30,[sp],#96 |
// Labort is also the landing point for len == 0 (no frame to unwind there).
| Labort: |
| ret |
| |
// Ltail / Less_than_64 -- finish a final partial block (1..63 bytes).
//
// Ltail is entered from the scalar loop after its "subs x2,x2,#64" borrowed,
// so 64 is added back to recover the number of bytes left. Less_than_64 is
// also branched to from Ltail_neon with x2 already holding that count.
// On both entries w5-w17,w19-w21 hold the finished, still unpacked, keystream
// words for the block, and sp points at the 64-byte scratch area.
| .align 4 |
| Ltail: |
| add x2,x2,#64 |
| Less_than_64: |
// Bias every pointer to the end of its buffer and negate the length, so one
// index register counts up from -len to 0:
//   x1 = in + len, x4 = scratch + len, x0 = out + len - 1, x2 = -len.
// The output pointer carries the extra -1 because the store in Loop_tail
// happens after x2 has been incremented.
| sub x0,x0,#1 |
| add x1,x1,x2 |
| add x0,x0,x2 |
| add x4,sp,x2 |
| neg x2,x2 |
| |
| add x5,x5,x6,lsl#32 // pack |
| add x7,x7,x8,lsl#32 |
| add x9,x9,x10,lsl#32 |
| add x11,x11,x12,lsl#32 |
| add x13,x13,x14,lsl#32 |
| add x15,x15,x16,lsl#32 |
| add x17,x17,x19,lsl#32 |
| add x20,x20,x21,lsl#32 |
| #ifdef __ARMEB__ |
| rev x5,x5 |
| rev x7,x7 |
| rev x9,x9 |
| rev x11,x11 |
| rev x13,x13 |
| rev x15,x15 |
| rev x17,x17 |
| rev x20,x20 |
| #endif |
// Stage the whole 64-byte keystream block in the stack scratch area.
| stp x5,x7,[sp,#0] |
| stp x9,x11,[sp,#16] |
| stp x13,x15,[sp,#32] |
| stp x17,x20,[sp,#48] |
| |
// Byte-wise XOR of input with staged keystream; ends when x2 reaches 0.
| Loop_tail: |
| ldrb w10,[x1,x2] |
| ldrb w11,[x4,x2] |
| add x2,x2,#1 |
| eor w10,w10,w11 |
| strb w10,[x0,x2] |
| cbnz x2,Loop_tail |
| |
// Overwrite the staged keystream with zeros before releasing the stack.
| stp xzr,xzr,[sp,#0] |
| stp xzr,xzr,[sp,#16] |
| stp xzr,xzr,[sp,#32] |
| stp xzr,xzr,[sp,#48] |
| |
// Epilogue: drop the scratch area, restore x19-x28 via x29, pop the frame.
| ldp x19,x20,[x29,#16] |
| add sp,sp,#64 |
| ldp x21,x22,[x29,#32] |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldp x29,x30,[sp],#96 |
| ret |
| |
| |
| |
// ChaCha20_neon -- NEON-assisted path, 256 bytes (four blocks) per iteration.
//
// Reached by a branch from _ChaCha20_ctr32 with x0-x4 still holding the
// original arguments (x0 = out, x1 = in, x2 = length, x3 = key pointer,
// x4 = counter pointer) and x2 >= 192. Lengths of 512 bytes or more are
// handed on to L512_or_more_neon after the common prologue.
//
// Each iteration computes one block in general-purpose registers (same
// register map as the scalar routine) interleaved with three blocks in
// vector registers:
//   v0-v3   block at counter+1      v4-v7   block at counter+2
//   v16-v19 block at counter+3      v20-v23 temporaries / input data
//   v24 = sigma, v25,v26 = key, v27,v28,v29 = counter rows for the three
//   vector blocks, v31 = counter step.
// v8-v15 are not touched in this routine, so none are saved here.
| .align 5 |
| ChaCha20_neon: |
| stp x29,x30,[sp,#-96]! |
| add x29,sp,#0 |
| |
| adr x5,Lsigma |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| stp x25,x26,[sp,#64] |
| stp x27,x28,[sp,#80] |
| cmp x2,#512 |
| b.hs L512_or_more_neon |
| |
// 64 bytes of scratch at sp, used by the tail code to stage keystream.
| sub sp,sp,#64 |
| |
// Every input is loaded twice: into general-purpose registers for the scalar
// block and into vector registers for the NEON blocks. The post-increment on
// the sigma load leaves x5 pointing at Lone, which is then loaded into v31.
| ldp x22,x23,[x5] // load sigma |
| ld1 {v24.4s},[x5],#16 |
| ldp x24,x25,[x3] // load key |
| ldp x26,x27,[x3,#16] |
| ld1 {v25.4s,v26.4s},[x3] |
| ldp x28,x30,[x4] // load counter |
| ld1 {v27.4s},[x4] |
| ld1 {v31.4s},[x5] |
| #ifdef __ARMEB__ |
| rev64 v24.4s,v24.4s |
| ror x24,x24,#32 |
| ror x25,x25,#32 |
| ror x26,x26,#32 |
| ror x27,x27,#32 |
| ror x28,x28,#32 |
| ror x30,x30,#32 |
| #endif |
// v27/v28/v29 = counter row with its first lane advanced by 1/2/3; the
// scalar block uses the unmodified counter. v31 then becomes {4,0,0,0}.
| add v27.4s,v27.4s,v31.4s // += 1 |
| add v28.4s,v27.4s,v31.4s |
| add v29.4s,v28.4s,v31.4s |
| shl v31.4s,v31.4s,#2 // 1 -> 4 |
| |
// Initialise the scalar working state and the three vector working states
// from the saved inputs.
| Loop_outer_neon: |
| mov w5,w22 // unpack key block |
| lsr x6,x22,#32 |
| mov v0.16b,v24.16b |
| mov w7,w23 |
| lsr x8,x23,#32 |
| mov v4.16b,v24.16b |
| mov w9,w24 |
| lsr x10,x24,#32 |
| mov v16.16b,v24.16b |
| mov w11,w25 |
| mov v1.16b,v25.16b |
| lsr x12,x25,#32 |
| mov v5.16b,v25.16b |
| mov w13,w26 |
| mov v17.16b,v25.16b |
| lsr x14,x26,#32 |
| mov v3.16b,v27.16b |
| mov w15,w27 |
| mov v7.16b,v28.16b |
| lsr x16,x27,#32 |
| mov v19.16b,v29.16b |
| mov w17,w28 |
| mov v2.16b,v26.16b |
| lsr x19,x28,#32 |
| mov v6.16b,v26.16b |
| mov w20,w30 |
| mov v18.16b,v26.16b |
| lsr x21,x30,#32 |
| |
// x4 = 10 passes (column round + diagonal round each). The flags from this
// subs are consumed after the loop by "b.lo Ltail_neon" and
// "b.hi Loop_outer_neon"; nothing in between writes the flags.
| mov x4,#10 |
| subs x2,x2,#256 |
// Loop body: scalar and vector instructions are interleaved.
// Vector rotates: rev32 on 16-bit elements rotates each 32-bit lane by 16;
// ushr #20 + sli #12, ushr #24 + sli #8 and ushr #25 + sli #7 are left
// rotates by 12, 8 and 7. Scalar ror by 16/20/24/25 gives the same amounts.
| Loop_neon: |
| sub x4,x4,#1 |
| add v0.4s,v0.4s,v1.4s |
| add w5,w5,w9 |
| add v4.4s,v4.4s,v5.4s |
| add w6,w6,w10 |
| add v16.4s,v16.4s,v17.4s |
| add w7,w7,w11 |
| eor v3.16b,v3.16b,v0.16b |
| add w8,w8,w12 |
| eor v7.16b,v7.16b,v4.16b |
| eor w17,w17,w5 |
| eor v19.16b,v19.16b,v16.16b |
| eor w19,w19,w6 |
| rev32 v3.8h,v3.8h |
| eor w20,w20,w7 |
| rev32 v7.8h,v7.8h |
| eor w21,w21,w8 |
| rev32 v19.8h,v19.8h |
| ror w17,w17,#16 |
| add v2.4s,v2.4s,v3.4s |
| ror w19,w19,#16 |
| add v6.4s,v6.4s,v7.4s |
| ror w20,w20,#16 |
| add v18.4s,v18.4s,v19.4s |
| ror w21,w21,#16 |
| eor v20.16b,v1.16b,v2.16b |
| add w13,w13,w17 |
| eor v21.16b,v5.16b,v6.16b |
| add w14,w14,w19 |
| eor v22.16b,v17.16b,v18.16b |
| add w15,w15,w20 |
| ushr v1.4s,v20.4s,#20 |
| add w16,w16,w21 |
| ushr v5.4s,v21.4s,#20 |
| eor w9,w9,w13 |
| ushr v17.4s,v22.4s,#20 |
| eor w10,w10,w14 |
| sli v1.4s,v20.4s,#12 |
| eor w11,w11,w15 |
| sli v5.4s,v21.4s,#12 |
| eor w12,w12,w16 |
| sli v17.4s,v22.4s,#12 |
| ror w9,w9,#20 |
| add v0.4s,v0.4s,v1.4s |
| ror w10,w10,#20 |
| add v4.4s,v4.4s,v5.4s |
| ror w11,w11,#20 |
| add v16.4s,v16.4s,v17.4s |
| ror w12,w12,#20 |
| eor v20.16b,v3.16b,v0.16b |
| add w5,w5,w9 |
| eor v21.16b,v7.16b,v4.16b |
| add w6,w6,w10 |
| eor v22.16b,v19.16b,v16.16b |
| add w7,w7,w11 |
| ushr v3.4s,v20.4s,#24 |
| add w8,w8,w12 |
| ushr v7.4s,v21.4s,#24 |
| eor w17,w17,w5 |
| ushr v19.4s,v22.4s,#24 |
| eor w19,w19,w6 |
| sli v3.4s,v20.4s,#8 |
| eor w20,w20,w7 |
| sli v7.4s,v21.4s,#8 |
| eor w21,w21,w8 |
| sli v19.4s,v22.4s,#8 |
| ror w17,w17,#24 |
| add v2.4s,v2.4s,v3.4s |
| ror w19,w19,#24 |
| add v6.4s,v6.4s,v7.4s |
| ror w20,w20,#24 |
| add v18.4s,v18.4s,v19.4s |
| ror w21,w21,#24 |
| eor v20.16b,v1.16b,v2.16b |
| add w13,w13,w17 |
| eor v21.16b,v5.16b,v6.16b |
| add w14,w14,w19 |
| eor v22.16b,v17.16b,v18.16b |
| add w15,w15,w20 |
| ushr v1.4s,v20.4s,#25 |
| add w16,w16,w21 |
| ushr v5.4s,v21.4s,#25 |
| eor w9,w9,w13 |
| ushr v17.4s,v22.4s,#25 |
| eor w10,w10,w14 |
| sli v1.4s,v20.4s,#7 |
| eor w11,w11,w15 |
| sli v5.4s,v21.4s,#7 |
| eor w12,w12,w16 |
| sli v17.4s,v22.4s,#7 |
| ror w9,w9,#25 |
// Rotate the lanes of the second, third and fourth rows (by 4, 8 and 12
// bytes) so the next vector quarter-round works on diagonals.
| ext v2.16b,v2.16b,v2.16b,#8 |
| ror w10,w10,#25 |
| ext v6.16b,v6.16b,v6.16b,#8 |
| ror w11,w11,#25 |
| ext v18.16b,v18.16b,v18.16b,#8 |
| ror w12,w12,#25 |
| ext v3.16b,v3.16b,v3.16b,#12 |
| ext v7.16b,v7.16b,v7.16b,#12 |
| ext v19.16b,v19.16b,v19.16b,#12 |
| ext v1.16b,v1.16b,v1.16b,#4 |
| ext v5.16b,v5.16b,v5.16b,#4 |
| ext v17.16b,v17.16b,v17.16b,#4 |
// Diagonal round. The scalar side selects the diagonals by register choice:
// (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
| add v0.4s,v0.4s,v1.4s |
| add w5,w5,w10 |
| add v4.4s,v4.4s,v5.4s |
| add w6,w6,w11 |
| add v16.4s,v16.4s,v17.4s |
| add w7,w7,w12 |
| eor v3.16b,v3.16b,v0.16b |
| add w8,w8,w9 |
| eor v7.16b,v7.16b,v4.16b |
| eor w21,w21,w5 |
| eor v19.16b,v19.16b,v16.16b |
| eor w17,w17,w6 |
| rev32 v3.8h,v3.8h |
| eor w19,w19,w7 |
| rev32 v7.8h,v7.8h |
| eor w20,w20,w8 |
| rev32 v19.8h,v19.8h |
| ror w21,w21,#16 |
| add v2.4s,v2.4s,v3.4s |
| ror w17,w17,#16 |
| add v6.4s,v6.4s,v7.4s |
| ror w19,w19,#16 |
| add v18.4s,v18.4s,v19.4s |
| ror w20,w20,#16 |
| eor v20.16b,v1.16b,v2.16b |
| add w15,w15,w21 |
| eor v21.16b,v5.16b,v6.16b |
| add w16,w16,w17 |
| eor v22.16b,v17.16b,v18.16b |
| add w13,w13,w19 |
| ushr v1.4s,v20.4s,#20 |
| add w14,w14,w20 |
| ushr v5.4s,v21.4s,#20 |
| eor w10,w10,w15 |
| ushr v17.4s,v22.4s,#20 |
| eor w11,w11,w16 |
| sli v1.4s,v20.4s,#12 |
| eor w12,w12,w13 |
| sli v5.4s,v21.4s,#12 |
| eor w9,w9,w14 |
| sli v17.4s,v22.4s,#12 |
| ror w10,w10,#20 |
| add v0.4s,v0.4s,v1.4s |
| ror w11,w11,#20 |
| add v4.4s,v4.4s,v5.4s |
| ror w12,w12,#20 |
| add v16.4s,v16.4s,v17.4s |
| ror w9,w9,#20 |
| eor v20.16b,v3.16b,v0.16b |
| add w5,w5,w10 |
| eor v21.16b,v7.16b,v4.16b |
| add w6,w6,w11 |
| eor v22.16b,v19.16b,v16.16b |
| add w7,w7,w12 |
| ushr v3.4s,v20.4s,#24 |
| add w8,w8,w9 |
| ushr v7.4s,v21.4s,#24 |
| eor w21,w21,w5 |
| ushr v19.4s,v22.4s,#24 |
| eor w17,w17,w6 |
| sli v3.4s,v20.4s,#8 |
| eor w19,w19,w7 |
| sli v7.4s,v21.4s,#8 |
| eor w20,w20,w8 |
| sli v19.4s,v22.4s,#8 |
| ror w21,w21,#24 |
| add v2.4s,v2.4s,v3.4s |
| ror w17,w17,#24 |
| add v6.4s,v6.4s,v7.4s |
| ror w19,w19,#24 |
| add v18.4s,v18.4s,v19.4s |
| ror w20,w20,#24 |
| eor v20.16b,v1.16b,v2.16b |
| add w15,w15,w21 |
| eor v21.16b,v5.16b,v6.16b |
| add w16,w16,w17 |
| eor v22.16b,v17.16b,v18.16b |
| add w13,w13,w19 |
| ushr v1.4s,v20.4s,#25 |
| add w14,w14,w20 |
| ushr v5.4s,v21.4s,#25 |
| eor w10,w10,w15 |
| ushr v17.4s,v22.4s,#25 |
| eor w11,w11,w16 |
| sli v1.4s,v20.4s,#7 |
| eor w12,w12,w13 |
| sli v5.4s,v21.4s,#7 |
| eor w9,w9,w14 |
| sli v17.4s,v22.4s,#7 |
| ror w10,w10,#25 |
// Undo the lane rotation (12, 8 and 4 bytes this time) so the rows are back
// in column order for the next pass.
| ext v2.16b,v2.16b,v2.16b,#8 |
| ror w11,w11,#25 |
| ext v6.16b,v6.16b,v6.16b,#8 |
| ror w12,w12,#25 |
| ext v18.16b,v18.16b,v18.16b,#8 |
| ror w9,w9,#25 |
| ext v3.16b,v3.16b,v3.16b,#4 |
| ext v7.16b,v7.16b,v7.16b,#4 |
| ext v19.16b,v19.16b,v19.16b,#4 |
| ext v1.16b,v1.16b,v1.16b,#12 |
| ext v5.16b,v5.16b,v5.16b,#12 |
| ext v17.16b,v17.16b,v17.16b,#12 |
| cbnz x4,Loop_neon |
| |
// Add the original input state to all four blocks. Each vector block gets
// its own counter row (v27, v28 or v29).
| add w5,w5,w22 // accumulate key block |
| add v0.4s,v0.4s,v24.4s |
| add x6,x6,x22,lsr#32 |
| add v4.4s,v4.4s,v24.4s |
| add w7,w7,w23 |
| add v16.4s,v16.4s,v24.4s |
| add x8,x8,x23,lsr#32 |
| add v2.4s,v2.4s,v26.4s |
| add w9,w9,w24 |
| add v6.4s,v6.4s,v26.4s |
| add x10,x10,x24,lsr#32 |
| add v18.4s,v18.4s,v26.4s |
| add w11,w11,w25 |
| add v3.4s,v3.4s,v27.4s |
| add x12,x12,x25,lsr#32 |
| add w13,w13,w26 |
| add v7.4s,v7.4s,v28.4s |
| add x14,x14,x26,lsr#32 |
| add w15,w15,w27 |
| add v19.4s,v19.4s,v29.4s |
| add x16,x16,x27,lsr#32 |
| add w17,w17,w28 |
| add v1.4s,v1.4s,v25.4s |
| add x19,x19,x28,lsr#32 |
| add w20,w20,w30 |
| add v5.4s,v5.4s,v25.4s |
| add x21,x21,x30,lsr#32 |
| add v17.4s,v17.4s,v25.4s |
| |
// Fewer than 256 bytes were left (the subs above borrowed): the four
// keystream blocks are handed to Ltail_neon to be consumed piecewise.
| b.lo Ltail_neon |
| |
// Full 256 bytes. Output order is: scalar block, v0-v3, v4-v7, v16-v19.
| add x5,x5,x6,lsl#32 // pack |
| add x7,x7,x8,lsl#32 |
| ldp x6,x8,[x1,#0] // load input |
| add x9,x9,x10,lsl#32 |
| add x11,x11,x12,lsl#32 |
| ldp x10,x12,[x1,#16] |
| add x13,x13,x14,lsl#32 |
| add x15,x15,x16,lsl#32 |
| ldp x14,x16,[x1,#32] |
| add x17,x17,x19,lsl#32 |
| add x20,x20,x21,lsl#32 |
| ldp x19,x21,[x1,#48] |
| add x1,x1,#64 |
| #ifdef __ARMEB__ |
| rev x5,x5 |
| rev x7,x7 |
| rev x9,x9 |
| rev x11,x11 |
| rev x13,x13 |
| rev x15,x15 |
| rev x17,x17 |
| rev x20,x20 |
| #endif |
| ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| eor x5,x5,x6 |
| eor x7,x7,x8 |
| eor x9,x9,x10 |
| eor x11,x11,x12 |
| eor x13,x13,x14 |
| eor v0.16b,v0.16b,v20.16b |
| eor x15,x15,x16 |
| eor v1.16b,v1.16b,v21.16b |
| eor x17,x17,x19 |
| eor v2.16b,v2.16b,v22.16b |
| eor x20,x20,x21 |
| eor v3.16b,v3.16b,v23.16b |
| ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| |
// Advance the scalar counter and the three vector counter rows by 4 blocks.
| stp x5,x7,[x0,#0] // store output |
| add x28,x28,#4 // increment counter |
| stp x9,x11,[x0,#16] |
| add v27.4s,v27.4s,v31.4s // += 4 |
| stp x13,x15,[x0,#32] |
| add v28.4s,v28.4s,v31.4s |
| stp x17,x20,[x0,#48] |
| add v29.4s,v29.4s,v31.4s |
| add x0,x0,#64 |
| |
// v0-v3 are stored and then reused to hold the input for the fourth block.
| st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
| ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
| |
| eor v4.16b,v4.16b,v20.16b |
| eor v5.16b,v5.16b,v21.16b |
| eor v6.16b,v6.16b,v22.16b |
| eor v7.16b,v7.16b,v23.16b |
| st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
| |
| eor v16.16b,v16.16b,v0.16b |
| eor v17.16b,v17.16b,v1.16b |
| eor v18.16b,v18.16b,v2.16b |
| eor v19.16b,v19.16b,v3.16b |
| st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
| |
// Still using the flags from the subs: loop while bytes remain.
| b.hi Loop_outer_neon |
| |
// Epilogue: drop the scratch area, restore x19-x28 via x29, pop the frame.
| ldp x19,x20,[x29,#16] |
| add sp,sp,#64 |
| ldp x21,x22,[x29,#32] |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldp x29,x30,[sp],#96 |
| ret |
| |
// Ltail_neon -- consume the last 1..255 bytes of the NEON path.
//
// Entered from Loop_outer_neon after its "subs x2,x2,#256" borrowed. All four
// keystream blocks for this iteration are already computed: the scalar block
// (unpacked, in w5-w17,w19-w21) followed by v0-v3, v4-v7 and v16-v19.
// They are used in that order, 64 bytes at a time. Whenever fewer than 64
// bytes remain, the next unused block is staged in the stack scratch area and
// XORed byte by byte.
| Ltail_neon: |
// Recover the number of bytes left. With fewer than 64, the scalar tail code
// finishes the job using the unpacked scalar block.
| add x2,x2,#256 |
| cmp x2,#64 |
| b.lo Less_than_64 |
| |
| add x5,x5,x6,lsl#32 // pack |
| add x7,x7,x8,lsl#32 |
| ldp x6,x8,[x1,#0] // load input |
| add x9,x9,x10,lsl#32 |
| add x11,x11,x12,lsl#32 |
| ldp x10,x12,[x1,#16] |
| add x13,x13,x14,lsl#32 |
| add x15,x15,x16,lsl#32 |
| ldp x14,x16,[x1,#32] |
| add x17,x17,x19,lsl#32 |
| add x20,x20,x21,lsl#32 |
| ldp x19,x21,[x1,#48] |
| add x1,x1,#64 |
| #ifdef __ARMEB__ |
| rev x5,x5 |
| rev x7,x7 |
| rev x9,x9 |
| rev x11,x11 |
| rev x13,x13 |
| rev x15,x15 |
| rev x17,x17 |
| rev x20,x20 |
| #endif |
| eor x5,x5,x6 |
| eor x7,x7,x8 |
| eor x9,x9,x10 |
| eor x11,x11,x12 |
| eor x13,x13,x14 |
| eor x15,x15,x16 |
| eor x17,x17,x19 |
| eor x20,x20,x21 |
| |
// The incremented x28 is not read again on any path from here; the saved
// x28 is reloaded in the epilogue.
| stp x5,x7,[x0,#0] // store output |
| add x28,x28,#4 // increment counter |
| stp x9,x11,[x0,#16] |
| stp x13,x15,[x0,#32] |
| stp x17,x20,[x0,#48] |
| add x0,x0,#64 |
// b.eq uses the flags of the last "cmp x2,#64": exactly 64 bytes were left,
// so everything is done. Nothing since that cmp has written the flags.
| b.eq Ldone_neon |
| sub x2,x2,#64 |
| cmp x2,#64 |
| b.lo Less_than_128 |
| |
// Second 64 bytes: vector block v0-v3.
| ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| eor v0.16b,v0.16b,v20.16b |
| eor v1.16b,v1.16b,v21.16b |
| eor v2.16b,v2.16b,v22.16b |
| eor v3.16b,v3.16b,v23.16b |
| st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
| b.eq Ldone_neon |
| sub x2,x2,#64 |
| cmp x2,#64 |
| b.lo Less_than_192 |
| |
// Third 64 bytes: vector block v4-v7.
| ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 |
| eor v4.16b,v4.16b,v20.16b |
| eor v5.16b,v5.16b,v21.16b |
| eor v6.16b,v6.16b,v22.16b |
| eor v7.16b,v7.16b,v23.16b |
| st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
| b.eq Ldone_neon |
| sub x2,x2,#64 |
| |
// 1..63 bytes remain: stage the fourth block (v16-v19) as keystream.
| st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] |
| b Last_neon |
| |
// 1..63 bytes remain after the scalar block: stage v0-v3.
| Less_than_128: |
| st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] |
| b Last_neon |
// 1..63 bytes remain after v0-v3: stage v4-v7.
| Less_than_192: |
| st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] |
| b Last_neon |
| |
// Last_neon: byte-wise XOR of the final x2 bytes against the keystream
// staged at sp. Pointers are biased to the end of their buffers and x2 is
// negated so a single index counts up to 0:
//   x1 = in + len, x4 = scratch + len, x0 = out + len - 1, x2 = -len.
// The -1 on the output pointer compensates for the store happening after
// x2 has been incremented.
| .align 4 |
| Last_neon: |
| sub x0,x0,#1 |
| add x1,x1,x2 |
| add x0,x0,x2 |
| add x4,sp,x2 |
| neg x2,x2 |
| |
| Loop_tail_neon: |
| ldrb w10,[x1,x2] |
| ldrb w11,[x4,x2] |
| add x2,x2,#1 |
| eor w10,w10,w11 |
| strb w10,[x0,x2] |
| cbnz x2,Loop_tail_neon |
| |
// Overwrite the staged keystream with zeros before releasing the stack.
| stp xzr,xzr,[sp,#0] |
| stp xzr,xzr,[sp,#16] |
| stp xzr,xzr,[sp,#32] |
| stp xzr,xzr,[sp,#48] |
| |
// Common exit: drop the scratch area, restore x19-x28 via x29, pop the frame.
| Ldone_neon: |
| ldp x19,x20,[x29,#16] |
| add sp,sp,#64 |
| ldp x21,x22,[x29,#32] |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldp x29,x30,[sp],#96 |
| ret |
| |
| |
| .align 5 |
| ChaCha20_512_neon: |
| stp x29,x30,[sp,#-96]! |
| add x29,sp,#0 |
| |
| adr x5,Lsigma |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| stp x25,x26,[sp,#64] |
| stp x27,x28,[sp,#80] |
| |
| L512_or_more_neon: |
| sub sp,sp,#128+64 |
| |
| ldp x22,x23,[x5] // load sigma |
| ld1 {v24.4s},[x5],#16 |
| ldp x24,x25,[x3] // load key |
| ldp x26,x27,[x3,#16] |
| ld1 {v25.4s,v26.4s},[x3] |
| ldp x28,x30,[x4] // load counter |
| ld1 {v27.4s},[x4] |
| ld1 {v31.4s},[x5] |
| #ifdef __ARMEB__ |
| rev64 v24.4s,v24.4s |
| ror x24,x24,#32 |
| ror x25,x25,#32 |
| ror x26,x26,#32 |
| ror x27,x27,#32 |
| ror x28,x28,#32 |
| ror x30,x30,#32 |
| #endif |
| add v27.4s,v27.4s,v31.4s // += 1 |
| stp q24,q25,[sp,#0] // off-load key block, invariant part |
| add v27.4s,v27.4s,v31.4s // not typo |
| str q26,[sp,#32] |
| add v28.4s,v27.4s,v31.4s |
| add v29.4s,v28.4s,v31.4s |
| add v30.4s,v29.4s,v31.4s |
| shl v31.4s,v31.4s,#2 // 1 -> 4 |
| |
| stp d8,d9,[sp,#128+0] // meet ABI requirements |
| stp d10,d11,[sp,#128+16] |
| stp d12,d13,[sp,#128+32] |
| stp d14,d15,[sp,#128+48] |
| |
| sub x2,x2,#512 // not typo |
| |
| Loop_outer_512_neon: |
| mov v0.16b,v24.16b |
| mov v4.16b,v24.16b |
| mov v8.16b,v24.16b |
| mov v12.16b,v24.16b |
| mov v16.16b,v24.16b |
| mov v20.16b,v24.16b |
| mov v1.16b,v25.16b |
| mov w5,w22 // unpack key block |
| mov v5.16b,v25.16b |
| lsr x6,x22,#32 |
| mov v9.16b,v25.16b |
| mov w7,w23 |
| mov v13.16b,v25.16b |
| lsr x8,x23,#32 |
| mov v17.16b,v25.16b |
| mov w9,w24 |
| mov v21.16b,v25.16b |
| lsr x10,x24,#32 |
| mov v3.16b,v27.16b |
| mov w11,w25 |
| mov v7.16b,v28.16b |
| lsr x12,x25,#32 |
| mov v11.16b,v29.16b |
| mov w13,w26 |
| mov v15.16b,v30.16b |
| lsr x14,x26,#32 |
| mov v2.16b,v26.16b |
| mov w15,w27 |
| mov v6.16b,v26.16b |
| lsr x16,x27,#32 |
| add v19.4s,v3.4s,v31.4s // +4 |
| mov w17,w28 |
| add v23.4s,v7.4s,v31.4s // +4 |
| lsr x19,x28,#32 |
| mov v10.16b,v26.16b |
| mov w20,w30 |
| mov v14.16b,v26.16b |
| lsr x21,x30,#32 |
| mov v18.16b,v26.16b |
| stp q27,q28,[sp,#48] // off-load key block, variable part |
| mov v22.16b,v26.16b |
| str q29,[sp,#80] |
| |
| mov x4,#5 |
| subs x2,x2,#512 |
| Loop_upper_neon: |
| sub x4,x4,#1 |
| add v0.4s,v0.4s,v1.4s |
| add w5,w5,w9 |
| add v4.4s,v4.4s,v5.4s |
| add w6,w6,w10 |
| add v8.4s,v8.4s,v9.4s |
| add w7,w7,w11 |
| add v12.4s,v12.4s,v13.4s |
| add w8,w8,w12 |
| add v16.4s,v16.4s,v17.4s |
| eor w17,w17,w5 |
| add v20.4s,v20.4s,v21.4s |
| eor w19,w19,w6 |
| eor v3.16b,v3.16b,v0.16b |
| eor w20,w20,w7 |
| eor v7.16b,v7.16b,v4.16b |
| eor w21,w21,w8 |
| eor v11.16b,v11.16b,v8.16b |
| ror w17,w17,#16 |
| eor v15.16b,v15.16b,v12.16b |
| ror w19,w19,#16 |
| eor v19.16b,v19.16b,v16.16b |
| ror w20,w20,#16 |
| eor v23.16b,v23.16b,v20.16b |
| ror w21,w21,#16 |
| rev32 v3.8h,v3.8h |
| add w13,w13,w17 |
| rev32 v7.8h,v7.8h |
| add w14,w14,w19 |
| rev32 v11.8h,v11.8h |
| add w15,w15,w20 |
| rev32 v15.8h,v15.8h |
| add w16,w16,w21 |
| rev32 v19.8h,v19.8h |
| eor w9,w9,w13 |
| rev32 v23.8h,v23.8h |
| eor w10,w10,w14 |
| add v2.4s,v2.4s,v3.4s |
| eor w11,w11,w15 |
| add v6.4s,v6.4s,v7.4s |
| eor w12,w12,w16 |
| add v10.4s,v10.4s,v11.4s |
| ror w9,w9,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w10,w10,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w11,w11,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w12,w12,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w9 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w10 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w11 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w12 |
| eor v28.16b,v17.16b,v18.16b |
| eor w17,w17,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w19,w19,w6 |
| ushr v1.4s,v24.4s,#20 |
| eor w20,w20,w7 |
| ushr v5.4s,v25.4s,#20 |
| eor w21,w21,w8 |
| ushr v9.4s,v26.4s,#20 |
| ror w17,w17,#24 |
| ushr v13.4s,v27.4s,#20 |
| ror w19,w19,#24 |
| ushr v17.4s,v28.4s,#20 |
| ror w20,w20,#24 |
| ushr v21.4s,v29.4s,#20 |
| ror w21,w21,#24 |
| sli v1.4s,v24.4s,#12 |
| add w13,w13,w17 |
| sli v5.4s,v25.4s,#12 |
| add w14,w14,w19 |
| sli v9.4s,v26.4s,#12 |
| add w15,w15,w20 |
| sli v13.4s,v27.4s,#12 |
| add w16,w16,w21 |
| sli v17.4s,v28.4s,#12 |
| eor w9,w9,w13 |
| sli v21.4s,v29.4s,#12 |
| eor w10,w10,w14 |
| add v0.4s,v0.4s,v1.4s |
| eor w11,w11,w15 |
| add v4.4s,v4.4s,v5.4s |
| eor w12,w12,w16 |
| add v8.4s,v8.4s,v9.4s |
| ror w9,w9,#25 |
| add v12.4s,v12.4s,v13.4s |
| ror w10,w10,#25 |
| add v16.4s,v16.4s,v17.4s |
| ror w11,w11,#25 |
| add v20.4s,v20.4s,v21.4s |
| ror w12,w12,#25 |
| eor v24.16b,v3.16b,v0.16b |
| add w5,w5,w10 |
| eor v25.16b,v7.16b,v4.16b |
| add w6,w6,w11 |
| eor v26.16b,v11.16b,v8.16b |
| add w7,w7,w12 |
| eor v27.16b,v15.16b,v12.16b |
| add w8,w8,w9 |
| eor v28.16b,v19.16b,v16.16b |
| eor w21,w21,w5 |
| eor v29.16b,v23.16b,v20.16b |
| eor w17,w17,w6 |
| ushr v3.4s,v24.4s,#24 |
| eor w19,w19,w7 |
| ushr v7.4s,v25.4s,#24 |
| eor w20,w20,w8 |
| ushr v11.4s,v26.4s,#24 |
| ror w21,w21,#16 |
| ushr v15.4s,v27.4s,#24 |
| ror w17,w17,#16 |
| ushr v19.4s,v28.4s,#24 |
| ror w19,w19,#16 |
| ushr v23.4s,v29.4s,#24 |
| ror w20,w20,#16 |
| sli v3.4s,v24.4s,#8 |
| add w15,w15,w21 |
| sli v7.4s,v25.4s,#8 |
| add w16,w16,w17 |
| sli v11.4s,v26.4s,#8 |
| add w13,w13,w19 |
| sli v15.4s,v27.4s,#8 |
| add w14,w14,w20 |
| sli v19.4s,v28.4s,#8 |
| eor w10,w10,w15 |
| sli v23.4s,v29.4s,#8 |
| eor w11,w11,w16 |
| add v2.4s,v2.4s,v3.4s |
| eor w12,w12,w13 |
| add v6.4s,v6.4s,v7.4s |
| eor w9,w9,w14 |
| add v10.4s,v10.4s,v11.4s |
| ror w10,w10,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w11,w11,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w12,w12,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w9,w9,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w10 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w11 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w12 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w9 |
| eor v28.16b,v17.16b,v18.16b |
| eor w21,w21,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w17,w17,w6 |
| ushr v1.4s,v24.4s,#25 |
| eor w19,w19,w7 |
| ushr v5.4s,v25.4s,#25 |
| eor w20,w20,w8 |
| ushr v9.4s,v26.4s,#25 |
| ror w21,w21,#24 |
| ushr v13.4s,v27.4s,#25 |
| ror w17,w17,#24 |
| ushr v17.4s,v28.4s,#25 |
| ror w19,w19,#24 |
| ushr v21.4s,v29.4s,#25 |
| ror w20,w20,#24 |
| sli v1.4s,v24.4s,#7 |
| add w15,w15,w21 |
| sli v5.4s,v25.4s,#7 |
| add w16,w16,w17 |
| sli v9.4s,v26.4s,#7 |
| add w13,w13,w19 |
| sli v13.4s,v27.4s,#7 |
| add w14,w14,w20 |
| sli v17.4s,v28.4s,#7 |
| eor w10,w10,w15 |
| sli v21.4s,v29.4s,#7 |
| eor w11,w11,w16 |
| ext v2.16b,v2.16b,v2.16b,#8 |
| eor w12,w12,w13 |
| ext v6.16b,v6.16b,v6.16b,#8 |
| eor w9,w9,w14 |
| ext v10.16b,v10.16b,v10.16b,#8 |
| ror w10,w10,#25 |
| ext v14.16b,v14.16b,v14.16b,#8 |
| ror w11,w11,#25 |
| ext v18.16b,v18.16b,v18.16b,#8 |
| ror w12,w12,#25 |
| ext v22.16b,v22.16b,v22.16b,#8 |
| ror w9,w9,#25 |
| ext v3.16b,v3.16b,v3.16b,#12 |
| ext v7.16b,v7.16b,v7.16b,#12 |
| ext v11.16b,v11.16b,v11.16b,#12 |
| ext v15.16b,v15.16b,v15.16b,#12 |
| ext v19.16b,v19.16b,v19.16b,#12 |
| ext v23.16b,v23.16b,v23.16b,#12 |
| ext v1.16b,v1.16b,v1.16b,#4 |
| ext v5.16b,v5.16b,v5.16b,#4 |
| ext v9.16b,v9.16b,v9.16b,#4 |
| ext v13.16b,v13.16b,v13.16b,#4 |
| ext v17.16b,v17.16b,v17.16b,#4 |
| ext v21.16b,v21.16b,v21.16b,#4 |
| add v0.4s,v0.4s,v1.4s |
| add w5,w5,w9 |
| add v4.4s,v4.4s,v5.4s |
| add w6,w6,w10 |
| add v8.4s,v8.4s,v9.4s |
| add w7,w7,w11 |
| add v12.4s,v12.4s,v13.4s |
| add w8,w8,w12 |
| add v16.4s,v16.4s,v17.4s |
| eor w17,w17,w5 |
| add v20.4s,v20.4s,v21.4s |
| eor w19,w19,w6 |
| eor v3.16b,v3.16b,v0.16b |
| eor w20,w20,w7 |
| eor v7.16b,v7.16b,v4.16b |
| eor w21,w21,w8 |
| eor v11.16b,v11.16b,v8.16b |
| ror w17,w17,#16 |
| eor v15.16b,v15.16b,v12.16b |
| ror w19,w19,#16 |
| eor v19.16b,v19.16b,v16.16b |
| ror w20,w20,#16 |
| eor v23.16b,v23.16b,v20.16b |
| ror w21,w21,#16 |
| rev32 v3.8h,v3.8h |
| add w13,w13,w17 |
| rev32 v7.8h,v7.8h |
| add w14,w14,w19 |
| rev32 v11.8h,v11.8h |
| add w15,w15,w20 |
| rev32 v15.8h,v15.8h |
| add w16,w16,w21 |
| rev32 v19.8h,v19.8h |
| eor w9,w9,w13 |
| rev32 v23.8h,v23.8h |
| eor w10,w10,w14 |
| add v2.4s,v2.4s,v3.4s |
| eor w11,w11,w15 |
| add v6.4s,v6.4s,v7.4s |
| eor w12,w12,w16 |
| add v10.4s,v10.4s,v11.4s |
| ror w9,w9,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w10,w10,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w11,w11,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w12,w12,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w9 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w10 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w11 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w12 |
| eor v28.16b,v17.16b,v18.16b |
| eor w17,w17,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w19,w19,w6 |
| ushr v1.4s,v24.4s,#20 |
| eor w20,w20,w7 |
| ushr v5.4s,v25.4s,#20 |
| eor w21,w21,w8 |
| ushr v9.4s,v26.4s,#20 |
| ror w17,w17,#24 |
| ushr v13.4s,v27.4s,#20 |
| ror w19,w19,#24 |
| ushr v17.4s,v28.4s,#20 |
| ror w20,w20,#24 |
| ushr v21.4s,v29.4s,#20 |
| ror w21,w21,#24 |
| sli v1.4s,v24.4s,#12 |
| add w13,w13,w17 |
| sli v5.4s,v25.4s,#12 |
| add w14,w14,w19 |
| sli v9.4s,v26.4s,#12 |
| add w15,w15,w20 |
| sli v13.4s,v27.4s,#12 |
| add w16,w16,w21 |
| sli v17.4s,v28.4s,#12 |
| eor w9,w9,w13 |
| sli v21.4s,v29.4s,#12 |
| eor w10,w10,w14 |
| add v0.4s,v0.4s,v1.4s |
| eor w11,w11,w15 |
| add v4.4s,v4.4s,v5.4s |
| eor w12,w12,w16 |
| add v8.4s,v8.4s,v9.4s |
| ror w9,w9,#25 |
| add v12.4s,v12.4s,v13.4s |
| ror w10,w10,#25 |
| add v16.4s,v16.4s,v17.4s |
| ror w11,w11,#25 |
| add v20.4s,v20.4s,v21.4s |
| ror w12,w12,#25 |
| eor v24.16b,v3.16b,v0.16b |
| add w5,w5,w10 |
| eor v25.16b,v7.16b,v4.16b |
| add w6,w6,w11 |
| eor v26.16b,v11.16b,v8.16b |
| add w7,w7,w12 |
| eor v27.16b,v15.16b,v12.16b |
| add w8,w8,w9 |
| eor v28.16b,v19.16b,v16.16b |
| eor w21,w21,w5 |
| eor v29.16b,v23.16b,v20.16b |
| eor w17,w17,w6 |
| ushr v3.4s,v24.4s,#24 |
| eor w19,w19,w7 |
| ushr v7.4s,v25.4s,#24 |
| eor w20,w20,w8 |
| ushr v11.4s,v26.4s,#24 |
| ror w21,w21,#16 |
| ushr v15.4s,v27.4s,#24 |
| ror w17,w17,#16 |
| ushr v19.4s,v28.4s,#24 |
| ror w19,w19,#16 |
| ushr v23.4s,v29.4s,#24 |
| ror w20,w20,#16 |
| sli v3.4s,v24.4s,#8 |
| add w15,w15,w21 |
| sli v7.4s,v25.4s,#8 |
| add w16,w16,w17 |
| sli v11.4s,v26.4s,#8 |
| add w13,w13,w19 |
| sli v15.4s,v27.4s,#8 |
| add w14,w14,w20 |
| sli v19.4s,v28.4s,#8 |
| eor w10,w10,w15 |
| sli v23.4s,v29.4s,#8 |
| eor w11,w11,w16 |
| add v2.4s,v2.4s,v3.4s |
| eor w12,w12,w13 |
| add v6.4s,v6.4s,v7.4s |
| eor w9,w9,w14 |
| add v10.4s,v10.4s,v11.4s |
| ror w10,w10,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w11,w11,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w12,w12,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w9,w9,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w10 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w11 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w12 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w9 |
| eor v28.16b,v17.16b,v18.16b |
| eor w21,w21,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w17,w17,w6 |
| ushr v1.4s,v24.4s,#25 |
| eor w19,w19,w7 |
| ushr v5.4s,v25.4s,#25 |
| eor w20,w20,w8 |
| ushr v9.4s,v26.4s,#25 |
| ror w21,w21,#24 |
| ushr v13.4s,v27.4s,#25 |
| ror w17,w17,#24 |
| ushr v17.4s,v28.4s,#25 |
| ror w19,w19,#24 |
| ushr v21.4s,v29.4s,#25 |
| ror w20,w20,#24 |
| sli v1.4s,v24.4s,#7 |
| add w15,w15,w21 |
| sli v5.4s,v25.4s,#7 |
| add w16,w16,w17 |
| sli v9.4s,v26.4s,#7 |
| add w13,w13,w19 |
| sli v13.4s,v27.4s,#7 |
| add w14,w14,w20 |
| sli v17.4s,v28.4s,#7 |
| eor w10,w10,w15 |
| sli v21.4s,v29.4s,#7 |
| eor w11,w11,w16 |
| ext v2.16b,v2.16b,v2.16b,#8 |
| eor w12,w12,w13 |
| ext v6.16b,v6.16b,v6.16b,#8 |
| eor w9,w9,w14 |
| ext v10.16b,v10.16b,v10.16b,#8 |
| ror w10,w10,#25 |
| ext v14.16b,v14.16b,v14.16b,#8 |
| ror w11,w11,#25 |
| ext v18.16b,v18.16b,v18.16b,#8 |
| ror w12,w12,#25 |
| ext v22.16b,v22.16b,v22.16b,#8 |
| ror w9,w9,#25 |
| ext v3.16b,v3.16b,v3.16b,#4 |
| ext v7.16b,v7.16b,v7.16b,#4 |
| ext v11.16b,v11.16b,v11.16b,#4 |
| ext v15.16b,v15.16b,v15.16b,#4 |
| ext v19.16b,v19.16b,v19.16b,#4 |
| ext v23.16b,v23.16b,v23.16b,#4 |
| ext v1.16b,v1.16b,v1.16b,#12 |
| ext v5.16b,v5.16b,v5.16b,#12 |
| ext v9.16b,v9.16b,v9.16b,#12 |
| ext v13.16b,v13.16b,v13.16b,#12 |
| ext v17.16b,v17.16b,v17.16b,#12 |
| ext v21.16b,v21.16b,v21.16b,#12 |
| cbnz x4,Loop_upper_neon |
| |
| add w5,w5,w22 // accumulate key block |
| add x6,x6,x22,lsr#32 |
| add w7,w7,w23 |
| add x8,x8,x23,lsr#32 |
| add w9,w9,w24 |
| add x10,x10,x24,lsr#32 |
| add w11,w11,w25 |
| add x12,x12,x25,lsr#32 |
| add w13,w13,w26 |
| add x14,x14,x26,lsr#32 |
| add w15,w15,w27 |
| add x16,x16,x27,lsr#32 |
| add w17,w17,w28 |
| add x19,x19,x28,lsr#32 |
| add w20,w20,w30 |
| add x21,x21,x30,lsr#32 |
| |
| add x5,x5,x6,lsl#32 // pack |
| add x7,x7,x8,lsl#32 |
| ldp x6,x8,[x1,#0] // load input |
| add x9,x9,x10,lsl#32 |
| add x11,x11,x12,lsl#32 |
| ldp x10,x12,[x1,#16] |
| add x13,x13,x14,lsl#32 |
| add x15,x15,x16,lsl#32 |
| ldp x14,x16,[x1,#32] |
| add x17,x17,x19,lsl#32 |
| add x20,x20,x21,lsl#32 |
| ldp x19,x21,[x1,#48] |
| add x1,x1,#64 |
| #ifdef __ARMEB__ |
| rev x5,x5 |
| rev x7,x7 |
| rev x9,x9 |
| rev x11,x11 |
| rev x13,x13 |
| rev x15,x15 |
| rev x17,x17 |
| rev x20,x20 |
| #endif |
| eor x5,x5,x6 |
| eor x7,x7,x8 |
| eor x9,x9,x10 |
| eor x11,x11,x12 |
| eor x13,x13,x14 |
| eor x15,x15,x16 |
| eor x17,x17,x19 |
| eor x20,x20,x21 |
| |
| stp x5,x7,[x0,#0] // store output |
| add x28,x28,#1 // increment counter |
| mov w5,w22 // unpack key block |
| lsr x6,x22,#32 |
| stp x9,x11,[x0,#16] |
| mov w7,w23 |
| lsr x8,x23,#32 |
| stp x13,x15,[x0,#32] |
| mov w9,w24 |
| lsr x10,x24,#32 |
| stp x17,x20,[x0,#48] |
| add x0,x0,#64 |
| mov w11,w25 |
| lsr x12,x25,#32 |
| mov w13,w26 |
| lsr x14,x26,#32 |
| mov w15,w27 |
| lsr x16,x27,#32 |
| mov w17,w28 |
| lsr x19,x28,#32 |
| mov w20,w30 |
| lsr x21,x30,#32 |
| |
| mov x4,#5 |
| // Loop_lower_neon: round loop, repeated until x4 (decremented at the top) |
| // reaches zero. Integer and NEON instructions alternate line by line: |
| //   integer (w5-w17, w19-w21): per pass, four add/eor/ror sequences with |
| //     rotations 16,20,24,25 - first on the (w5,w9,w13,w17)-style groups, |
| //     then on the (w5,w10,w15,w21)-style groups, then both once more; |
| //   NEON (v0-v23 as six groups of four, v24-v29 scratch): per pass, two such |
| //     sequences; rev32 on .8h is the rotate by 16 and each ushr/sli pair |
| //     is a rotate (ushr #20/sli #12, ushr #24/sli #8, ushr #25/sli #7). |
| Loop_lower_neon: |
| sub x4,x4,#1 |
| add v0.4s,v0.4s,v1.4s |
| add w5,w5,w9 |
| add v4.4s,v4.4s,v5.4s |
| add w6,w6,w10 |
| add v8.4s,v8.4s,v9.4s |
| add w7,w7,w11 |
| add v12.4s,v12.4s,v13.4s |
| add w8,w8,w12 |
| add v16.4s,v16.4s,v17.4s |
| eor w17,w17,w5 |
| add v20.4s,v20.4s,v21.4s |
| eor w19,w19,w6 |
| eor v3.16b,v3.16b,v0.16b |
| eor w20,w20,w7 |
| eor v7.16b,v7.16b,v4.16b |
| eor w21,w21,w8 |
| eor v11.16b,v11.16b,v8.16b |
| ror w17,w17,#16 |
| eor v15.16b,v15.16b,v12.16b |
| ror w19,w19,#16 |
| eor v19.16b,v19.16b,v16.16b |
| ror w20,w20,#16 |
| eor v23.16b,v23.16b,v20.16b |
| ror w21,w21,#16 |
| rev32 v3.8h,v3.8h |
| add w13,w13,w17 |
| rev32 v7.8h,v7.8h |
| add w14,w14,w19 |
| rev32 v11.8h,v11.8h |
| add w15,w15,w20 |
| rev32 v15.8h,v15.8h |
| add w16,w16,w21 |
| rev32 v19.8h,v19.8h |
| eor w9,w9,w13 |
| rev32 v23.8h,v23.8h |
| eor w10,w10,w14 |
| add v2.4s,v2.4s,v3.4s |
| eor w11,w11,w15 |
| add v6.4s,v6.4s,v7.4s |
| eor w12,w12,w16 |
| add v10.4s,v10.4s,v11.4s |
| ror w9,w9,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w10,w10,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w11,w11,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w12,w12,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w9 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w10 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w11 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w12 |
| eor v28.16b,v17.16b,v18.16b |
| eor w17,w17,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w19,w19,w6 |
| ushr v1.4s,v24.4s,#20 |
| eor w20,w20,w7 |
| ushr v5.4s,v25.4s,#20 |
| eor w21,w21,w8 |
| ushr v9.4s,v26.4s,#20 |
| ror w17,w17,#24 |
| ushr v13.4s,v27.4s,#20 |
| ror w19,w19,#24 |
| ushr v17.4s,v28.4s,#20 |
| ror w20,w20,#24 |
| ushr v21.4s,v29.4s,#20 |
| ror w21,w21,#24 |
| sli v1.4s,v24.4s,#12 |
| add w13,w13,w17 |
| sli v5.4s,v25.4s,#12 |
| add w14,w14,w19 |
| sli v9.4s,v26.4s,#12 |
| add w15,w15,w20 |
| sli v13.4s,v27.4s,#12 |
| add w16,w16,w21 |
| sli v17.4s,v28.4s,#12 |
| eor w9,w9,w13 |
| sli v21.4s,v29.4s,#12 |
| eor w10,w10,w14 |
| add v0.4s,v0.4s,v1.4s |
| eor w11,w11,w15 |
| add v4.4s,v4.4s,v5.4s |
| eor w12,w12,w16 |
| add v8.4s,v8.4s,v9.4s |
| ror w9,w9,#25 |
| add v12.4s,v12.4s,v13.4s |
| ror w10,w10,#25 |
| add v16.4s,v16.4s,v17.4s |
| ror w11,w11,#25 |
| add v20.4s,v20.4s,v21.4s |
| ror w12,w12,#25 |
| // integer side now switches to the (w5,w10,w15,w21)-style grouping |
| eor v24.16b,v3.16b,v0.16b |
| add w5,w5,w10 |
| eor v25.16b,v7.16b,v4.16b |
| add w6,w6,w11 |
| eor v26.16b,v11.16b,v8.16b |
| add w7,w7,w12 |
| eor v27.16b,v15.16b,v12.16b |
| add w8,w8,w9 |
| eor v28.16b,v19.16b,v16.16b |
| eor w21,w21,w5 |
| eor v29.16b,v23.16b,v20.16b |
| eor w17,w17,w6 |
| ushr v3.4s,v24.4s,#24 |
| eor w19,w19,w7 |
| ushr v7.4s,v25.4s,#24 |
| eor w20,w20,w8 |
| ushr v11.4s,v26.4s,#24 |
| ror w21,w21,#16 |
| ushr v15.4s,v27.4s,#24 |
| ror w17,w17,#16 |
| ushr v19.4s,v28.4s,#24 |
| ror w19,w19,#16 |
| ushr v23.4s,v29.4s,#24 |
| ror w20,w20,#16 |
| sli v3.4s,v24.4s,#8 |
| add w15,w15,w21 |
| sli v7.4s,v25.4s,#8 |
| add w16,w16,w17 |
| sli v11.4s,v26.4s,#8 |
| add w13,w13,w19 |
| sli v15.4s,v27.4s,#8 |
| add w14,w14,w20 |
| sli v19.4s,v28.4s,#8 |
| eor w10,w10,w15 |
| sli v23.4s,v29.4s,#8 |
| eor w11,w11,w16 |
| add v2.4s,v2.4s,v3.4s |
| eor w12,w12,w13 |
| add v6.4s,v6.4s,v7.4s |
| eor w9,w9,w14 |
| add v10.4s,v10.4s,v11.4s |
| ror w10,w10,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w11,w11,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w12,w12,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w9,w9,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w10 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w11 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w12 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w9 |
| eor v28.16b,v17.16b,v18.16b |
| eor w21,w21,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w17,w17,w6 |
| ushr v1.4s,v24.4s,#25 |
| eor w19,w19,w7 |
| ushr v5.4s,v25.4s,#25 |
| eor w20,w20,w8 |
| ushr v9.4s,v26.4s,#25 |
| ror w21,w21,#24 |
| ushr v13.4s,v27.4s,#25 |
| ror w17,w17,#24 |
| ushr v17.4s,v28.4s,#25 |
| ror w19,w19,#24 |
| ushr v21.4s,v29.4s,#25 |
| ror w20,w20,#24 |
| sli v1.4s,v24.4s,#7 |
| add w15,w15,w21 |
| sli v5.4s,v25.4s,#7 |
| add w16,w16,w17 |
| sli v9.4s,v26.4s,#7 |
| add w13,w13,w19 |
| sli v13.4s,v27.4s,#7 |
| add w14,w14,w20 |
| sli v17.4s,v28.4s,#7 |
| eor w10,w10,w15 |
| sli v21.4s,v29.4s,#7 |
| eor w11,w11,w16 |
| // NEON: rotate the bytes of each register in place - v2-type by 8, |
| // v3-type by 12, v1-type by 4 - so the next NEON sequence pairs different lanes |
| ext v2.16b,v2.16b,v2.16b,#8 |
| eor w12,w12,w13 |
| ext v6.16b,v6.16b,v6.16b,#8 |
| eor w9,w9,w14 |
| ext v10.16b,v10.16b,v10.16b,#8 |
| ror w10,w10,#25 |
| ext v14.16b,v14.16b,v14.16b,#8 |
| ror w11,w11,#25 |
| ext v18.16b,v18.16b,v18.16b,#8 |
| ror w12,w12,#25 |
| ext v22.16b,v22.16b,v22.16b,#8 |
| ror w9,w9,#25 |
| ext v3.16b,v3.16b,v3.16b,#12 |
| ext v7.16b,v7.16b,v7.16b,#12 |
| ext v11.16b,v11.16b,v11.16b,#12 |
| ext v15.16b,v15.16b,v15.16b,#12 |
| ext v19.16b,v19.16b,v19.16b,#12 |
| ext v23.16b,v23.16b,v23.16b,#12 |
| ext v1.16b,v1.16b,v1.16b,#4 |
| ext v5.16b,v5.16b,v5.16b,#4 |
| ext v9.16b,v9.16b,v9.16b,#4 |
| ext v13.16b,v13.16b,v13.16b,#4 |
| ext v17.16b,v17.16b,v17.16b,#4 |
| ext v21.16b,v21.16b,v21.16b,#4 |
| // second half of the pass: the same integer and NEON sequences again |
| add v0.4s,v0.4s,v1.4s |
| add w5,w5,w9 |
| add v4.4s,v4.4s,v5.4s |
| add w6,w6,w10 |
| add v8.4s,v8.4s,v9.4s |
| add w7,w7,w11 |
| add v12.4s,v12.4s,v13.4s |
| add w8,w8,w12 |
| add v16.4s,v16.4s,v17.4s |
| eor w17,w17,w5 |
| add v20.4s,v20.4s,v21.4s |
| eor w19,w19,w6 |
| eor v3.16b,v3.16b,v0.16b |
| eor w20,w20,w7 |
| eor v7.16b,v7.16b,v4.16b |
| eor w21,w21,w8 |
| eor v11.16b,v11.16b,v8.16b |
| ror w17,w17,#16 |
| eor v15.16b,v15.16b,v12.16b |
| ror w19,w19,#16 |
| eor v19.16b,v19.16b,v16.16b |
| ror w20,w20,#16 |
| eor v23.16b,v23.16b,v20.16b |
| ror w21,w21,#16 |
| rev32 v3.8h,v3.8h |
| add w13,w13,w17 |
| rev32 v7.8h,v7.8h |
| add w14,w14,w19 |
| rev32 v11.8h,v11.8h |
| add w15,w15,w20 |
| rev32 v15.8h,v15.8h |
| add w16,w16,w21 |
| rev32 v19.8h,v19.8h |
| eor w9,w9,w13 |
| rev32 v23.8h,v23.8h |
| eor w10,w10,w14 |
| add v2.4s,v2.4s,v3.4s |
| eor w11,w11,w15 |
| add v6.4s,v6.4s,v7.4s |
| eor w12,w12,w16 |
| add v10.4s,v10.4s,v11.4s |
| ror w9,w9,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w10,w10,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w11,w11,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w12,w12,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w9 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w10 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w11 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w12 |
| eor v28.16b,v17.16b,v18.16b |
| eor w17,w17,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w19,w19,w6 |
| ushr v1.4s,v24.4s,#20 |
| eor w20,w20,w7 |
| ushr v5.4s,v25.4s,#20 |
| eor w21,w21,w8 |
| ushr v9.4s,v26.4s,#20 |
| ror w17,w17,#24 |
| ushr v13.4s,v27.4s,#20 |
| ror w19,w19,#24 |
| ushr v17.4s,v28.4s,#20 |
| ror w20,w20,#24 |
| ushr v21.4s,v29.4s,#20 |
| ror w21,w21,#24 |
| sli v1.4s,v24.4s,#12 |
| add w13,w13,w17 |
| sli v5.4s,v25.4s,#12 |
| add w14,w14,w19 |
| sli v9.4s,v26.4s,#12 |
| add w15,w15,w20 |
| sli v13.4s,v27.4s,#12 |
| add w16,w16,w21 |
| sli v17.4s,v28.4s,#12 |
| eor w9,w9,w13 |
| sli v21.4s,v29.4s,#12 |
| eor w10,w10,w14 |
| add v0.4s,v0.4s,v1.4s |
| eor w11,w11,w15 |
| add v4.4s,v4.4s,v5.4s |
| eor w12,w12,w16 |
| add v8.4s,v8.4s,v9.4s |
| ror w9,w9,#25 |
| add v12.4s,v12.4s,v13.4s |
| ror w10,w10,#25 |
| add v16.4s,v16.4s,v17.4s |
| ror w11,w11,#25 |
| add v20.4s,v20.4s,v21.4s |
| ror w12,w12,#25 |
| // integer side switches to the (w5,w10,w15,w21)-style grouping again |
| eor v24.16b,v3.16b,v0.16b |
| add w5,w5,w10 |
| eor v25.16b,v7.16b,v4.16b |
| add w6,w6,w11 |
| eor v26.16b,v11.16b,v8.16b |
| add w7,w7,w12 |
| eor v27.16b,v15.16b,v12.16b |
| add w8,w8,w9 |
| eor v28.16b,v19.16b,v16.16b |
| eor w21,w21,w5 |
| eor v29.16b,v23.16b,v20.16b |
| eor w17,w17,w6 |
| ushr v3.4s,v24.4s,#24 |
| eor w19,w19,w7 |
| ushr v7.4s,v25.4s,#24 |
| eor w20,w20,w8 |
| ushr v11.4s,v26.4s,#24 |
| ror w21,w21,#16 |
| ushr v15.4s,v27.4s,#24 |
| ror w17,w17,#16 |
| ushr v19.4s,v28.4s,#24 |
| ror w19,w19,#16 |
| ushr v23.4s,v29.4s,#24 |
| ror w20,w20,#16 |
| sli v3.4s,v24.4s,#8 |
| add w15,w15,w21 |
| sli v7.4s,v25.4s,#8 |
| add w16,w16,w17 |
| sli v11.4s,v26.4s,#8 |
| add w13,w13,w19 |
| sli v15.4s,v27.4s,#8 |
| add w14,w14,w20 |
| sli v19.4s,v28.4s,#8 |
| eor w10,w10,w15 |
| sli v23.4s,v29.4s,#8 |
| eor w11,w11,w16 |
| add v2.4s,v2.4s,v3.4s |
| eor w12,w12,w13 |
| add v6.4s,v6.4s,v7.4s |
| eor w9,w9,w14 |
| add v10.4s,v10.4s,v11.4s |
| ror w10,w10,#20 |
| add v14.4s,v14.4s,v15.4s |
| ror w11,w11,#20 |
| add v18.4s,v18.4s,v19.4s |
| ror w12,w12,#20 |
| add v22.4s,v22.4s,v23.4s |
| ror w9,w9,#20 |
| eor v24.16b,v1.16b,v2.16b |
| add w5,w5,w10 |
| eor v25.16b,v5.16b,v6.16b |
| add w6,w6,w11 |
| eor v26.16b,v9.16b,v10.16b |
| add w7,w7,w12 |
| eor v27.16b,v13.16b,v14.16b |
| add w8,w8,w9 |
| eor v28.16b,v17.16b,v18.16b |
| eor w21,w21,w5 |
| eor v29.16b,v21.16b,v22.16b |
| eor w17,w17,w6 |
| ushr v1.4s,v24.4s,#25 |
| eor w19,w19,w7 |
| ushr v5.4s,v25.4s,#25 |
| eor w20,w20,w8 |
| ushr v9.4s,v26.4s,#25 |
| ror w21,w21,#24 |
| ushr v13.4s,v27.4s,#25 |
| ror w17,w17,#24 |
| ushr v17.4s,v28.4s,#25 |
| ror w19,w19,#24 |
| ushr v21.4s,v29.4s,#25 |
| ror w20,w20,#24 |
| sli v1.4s,v24.4s,#7 |
| add w15,w15,w21 |
| sli v5.4s,v25.4s,#7 |
| add w16,w16,w17 |
| sli v9.4s,v26.4s,#7 |
| add w13,w13,w19 |
| sli v13.4s,v27.4s,#7 |
| add w14,w14,w20 |
| sli v17.4s,v28.4s,#7 |
| eor w10,w10,w15 |
| sli v21.4s,v29.4s,#7 |
| eor w11,w11,w16 |
| // NEON: rotate by 8 / 4 / 12 bytes this time; together with the 8 / 12 / 4 |
| // rotation earlier in the pass each register is back in its original lane order |
| ext v2.16b,v2.16b,v2.16b,#8 |
| eor w12,w12,w13 |
| ext v6.16b,v6.16b,v6.16b,#8 |
| eor w9,w9,w14 |
| ext v10.16b,v10.16b,v10.16b,#8 |
| ror w10,w10,#25 |
| ext v14.16b,v14.16b,v14.16b,#8 |
| ror w11,w11,#25 |
| ext v18.16b,v18.16b,v18.16b,#8 |
| ror w12,w12,#25 |
| ext v22.16b,v22.16b,v22.16b,#8 |
| ror w9,w9,#25 |
| ext v3.16b,v3.16b,v3.16b,#4 |
| ext v7.16b,v7.16b,v7.16b,#4 |
| ext v11.16b,v11.16b,v11.16b,#4 |
| ext v15.16b,v15.16b,v15.16b,#4 |
| ext v19.16b,v19.16b,v19.16b,#4 |
| ext v23.16b,v23.16b,v23.16b,#4 |
| ext v1.16b,v1.16b,v1.16b,#12 |
| ext v5.16b,v5.16b,v5.16b,#12 |
| ext v9.16b,v9.16b,v9.16b,#12 |
| ext v13.16b,v13.16b,v13.16b,#12 |
| ext v17.16b,v17.16b,v17.16b,#12 |
| ext v21.16b,v21.16b,v21.16b,#12 |
| cbnz x4,Loop_lower_neon // repeat until the pass counter reaches zero |
| |
| // Post-round output stage (integer and NEON work interleaved): |
| //   1. add the saved input state back into the working state: x22-x28/x30 for |
| //      the integer words, v24-v30 (v24-v29 reloaded from the stack at sp) for |
| //      the six NEON groups; |
| //   2. XOR the result with input read from [x1] and write it to [x0]; |
| //   3. advance the counters (x28 and v27-v30), then either loop back, return |
| //      via Ldone_512_neon, or hand the remainder to Loop_outer_neon/Loop_outer. |
| add w5,w5,w22 // accumulate key block |
| ldp q24,q25,[sp,#0] |
| add x6,x6,x22,lsr#32 |
| ldp q26,q27,[sp,#32] |
| add w7,w7,w23 |
| ldp q28,q29,[sp,#64] |
| add x8,x8,x23,lsr#32 |
| add v0.4s,v0.4s,v24.4s |
| add w9,w9,w24 |
| add v4.4s,v4.4s,v24.4s |
| add x10,x10,x24,lsr#32 |
| add v8.4s,v8.4s,v24.4s |
| add w11,w11,w25 |
| add v12.4s,v12.4s,v24.4s |
| add x12,x12,x25,lsr#32 |
| add v16.4s,v16.4s,v24.4s |
| add w13,w13,w26 |
| add v20.4s,v20.4s,v24.4s |
| add x14,x14,x26,lsr#32 |
| add v2.4s,v2.4s,v26.4s |
| add w15,w15,w27 |
| add v6.4s,v6.4s,v26.4s |
| add x16,x16,x27,lsr#32 |
| add v10.4s,v10.4s,v26.4s |
| add w17,w17,w28 |
| add v14.4s,v14.4s,v26.4s |
| add x19,x19,x28,lsr#32 |
| add v18.4s,v18.4s,v26.4s |
| add w20,w20,w30 |
| add v22.4s,v22.4s,v26.4s |
| add x21,x21,x30,lsr#32 |
| add v19.4s,v19.4s,v31.4s // +4 |
| add x5,x5,x6,lsl#32 // pack |
| add v23.4s,v23.4s,v31.4s // +4 |
| add x7,x7,x8,lsl#32 |
| add v3.4s,v3.4s,v27.4s |
| ldp x6,x8,[x1,#0] // load input |
| add v7.4s,v7.4s,v28.4s |
| add x9,x9,x10,lsl#32 |
| add v11.4s,v11.4s,v29.4s |
| add x11,x11,x12,lsl#32 |
| add v15.4s,v15.4s,v30.4s |
| ldp x10,x12,[x1,#16] |
| add v19.4s,v19.4s,v27.4s |
| add x13,x13,x14,lsl#32 |
| add v23.4s,v23.4s,v28.4s |
| add x15,x15,x16,lsl#32 |
| add v1.4s,v1.4s,v25.4s |
| ldp x14,x16,[x1,#32] |
| add v5.4s,v5.4s,v25.4s |
| add x17,x17,x19,lsl#32 |
| add v9.4s,v9.4s,v25.4s |
| add x20,x20,x21,lsl#32 |
| add v13.4s,v13.4s,v25.4s |
| ldp x19,x21,[x1,#48] |
| add v17.4s,v17.4s,v25.4s |
| add x1,x1,#64 |
| add v21.4s,v21.4s,v25.4s |
| |
| // big-endian builds only: byte-reverse the packed integer words before the XOR |
| #ifdef __ARMEB__ |
| rev x5,x5 |
| rev x7,x7 |
| rev x9,x9 |
| rev x11,x11 |
| rev x13,x13 |
| rev x15,x15 |
| rev x17,x17 |
| rev x20,x20 |
| #endif |
| // v24-v27 are reused as input buffers from here on (reloaded from the stack below) |
| ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
| eor x5,x5,x6 |
| eor x7,x7,x8 |
| eor x9,x9,x10 |
| eor x11,x11,x12 |
| eor x13,x13,x14 |
| eor v0.16b,v0.16b,v24.16b |
| eor x15,x15,x16 |
| eor v1.16b,v1.16b,v25.16b |
| eor x17,x17,x19 |
| eor v2.16b,v2.16b,v26.16b |
| eor x20,x20,x21 |
| eor v3.16b,v3.16b,v27.16b |
| ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 |
| |
| stp x5,x7,[x0,#0] // store output |
| // NOTE(review): +7 here plus the +1 applied before Loop_lower_neon presumably |
| // matches the += 8 applied to v27-v30 below - confirm against the loop head |
| add x28,x28,#7 // increment counter |
| stp x9,x11,[x0,#16] |
| stp x13,x15,[x0,#32] |
| stp x17,x20,[x0,#48] |
| add x0,x0,#64 |
| st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 |
| |
| // remaining five NEON groups: load the next 64 input bytes into registers that |
| // have already been stored, XOR, store 64 bytes |
| ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 |
| eor v4.16b,v4.16b,v24.16b |
| eor v5.16b,v5.16b,v25.16b |
| eor v6.16b,v6.16b,v26.16b |
| eor v7.16b,v7.16b,v27.16b |
| st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 |
| |
| ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 |
| eor v8.16b,v8.16b,v0.16b |
| ldp q24,q25,[sp,#0] // v24-v27 held input; restore them from the stack copy |
| eor v9.16b,v9.16b,v1.16b |
| ldp q26,q27,[sp,#32] |
| eor v10.16b,v10.16b,v2.16b |
| eor v11.16b,v11.16b,v3.16b |
| st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 |
| |
| ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 |
| eor v12.16b,v12.16b,v4.16b |
| eor v13.16b,v13.16b,v5.16b |
| eor v14.16b,v14.16b,v6.16b |
| eor v15.16b,v15.16b,v7.16b |
| st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 |
| |
| ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 |
| eor v16.16b,v16.16b,v8.16b |
| eor v17.16b,v17.16b,v9.16b |
| eor v18.16b,v18.16b,v10.16b |
| eor v19.16b,v19.16b,v11.16b |
| st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 |
| |
| shl v0.4s,v31.4s,#1 // 4 -> 8 |
| eor v20.16b,v20.16b,v12.16b |
| eor v21.16b,v21.16b,v13.16b |
| eor v22.16b,v22.16b,v14.16b |
| eor v23.16b,v23.16b,v15.16b |
| st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 |
| |
| add v27.4s,v27.4s,v0.4s // += 8 |
| add v28.4s,v28.4s,v0.4s |
| add v29.4s,v29.4s,v0.4s |
| add v30.4s,v30.4s,v0.4s |
| |
| // NOTE(review): no instruction in this section sets the condition flags before |
| // this branch; they come from code above it (presumably a subs on the |
| // remaining length in x2 - confirm at Loop_outer_512_neon) |
| b.hs Loop_outer_512_neon |
| |
| // Leaving the 512-byte loop. adds sets Z when the adjusted x2 is exactly zero; |
| // the loads/stores below leave the flags untouched until the b.eq. |
| adds x2,x2,#512 |
| ushr v0.4s,v31.4s,#2 // 4 -> 1 |
| |
| ldp d8,d9,[sp,#128+0] // meet ABI requirements |
| ldp d10,d11,[sp,#128+16] |
| ldp d12,d13,[sp,#128+32] |
| ldp d14,d15,[sp,#128+48] |
| |
| stp q24,q31,[sp,#0] // wipe off-load area |
| stp q24,q31,[sp,#32] |
| stp q24,q31,[sp,#64] |
| |
| b.eq Ldone_512_neon |
| |
| // Input remains: step v27-v29 back by one, release 128 bytes of stack and |
| // continue in Loop_outer_neon if at least 192 bytes are left (unsigned compare). |
| cmp x2,#192 |
| sub v27.4s,v27.4s,v0.4s // -= 1 |
| sub v28.4s,v28.4s,v0.4s |
| sub v29.4s,v29.4s,v0.4s |
| add sp,sp,#128 |
| b.hs Loop_outer_neon |
| |
| // Fewer than 192 bytes left: zero v25-v30 and finish in the integer-only Loop_outer |
| eor v25.16b,v25.16b,v25.16b |
| eor v26.16b,v26.16b,v26.16b |
| eor v27.16b,v27.16b,v27.16b |
| eor v28.16b,v28.16b,v28.16b |
| eor v29.16b,v29.16b,v29.16b |
| eor v30.16b,v30.16b,v30.16b |
| b Loop_outer |
| |
| // Ldone_512_neon: return path, reached by the b.eq taken when the adjusted |
| // length in x2 is exactly zero. Reloads x19-x28 from the frame addressed by x29, |
| // releases 128+64 bytes of stack, then pops x29/x30 with the 96-byte frame and |
| // returns. |
| Ldone_512_neon: |
| ldp x19,x20,[x29,#16] |
| add sp,sp,#128+64 |
| ldp x21,x22,[x29,#32] |
| ldp x23,x24,[x29,#48] |
| ldp x25,x26,[x29,#64] |
| ldp x27,x28,[x29,#80] |
| ldp x29,x30,[sp],#96 // x29 reloaded last: it is the base for the loads above |
| ret |
| |