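// nexmon – Rev 1
//
// Subversion Repositories:
// Rev:
// (The four lines above are page-header text captured together with the file;
// they are kept as comments so that the assembler ignores them.)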
#if defined(__aarch64__)
#include <openssl/arm_arch.h>

.text



// Read-only constants shared by the ChaCha20 routines in this file.
.align  5
// The two quads below spell, byte by byte in little-endian order, the ASCII
// text "expand 32-byte k"; they are loaded as the first four 32-bit words of
// every ChaCha20 block.
.Lsigma:
.quad   0x3320646e61707865,0x6b20657479622d32           // endian-neutral
// Vector {1,0,0,0}. It must stay directly after .Lsigma: the NEON code reaches
// it by post-incrementing a pointer to .Lsigma by 16 and uses it to step the
// first 32-bit lane of the counter row.
.Lone:
.long   1,0,0,0
// Offset from this location to OPENSSL_armcap_P (32-bit under __ILP32__,
// 64-bit otherwise). ChaCha20_ctr32 adds the stored offset to the address of
// this label to reach the capability word without an absolute relocation.
.LOPENSSL_armcap_P:
#ifdef  __ILP32__
.long   OPENSSL_armcap_P-.
#else
.quad   OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII identification string:
//   ChaCha20 for ARMv8, CRYPTOGAMS by appro@openssl.org
// (the address is enclosed in angle brackets in the encoded bytes).
.byte   67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2

//-----------------------------------------------------------------------------
// ChaCha20_ctr32 - XOR a buffer with the ChaCha20 keystream (scalar code path,
// plus dispatch to the NEON code path for longer inputs).
//
// Registers on entry, as consumed by the code below:
//   x0 = output pointer
//   x1 = input pointer
//   x2 = length in bytes (zero returns immediately)
//   x3 = pointer to 32 bytes of key material
//   x4 = pointer to 16 bytes of counter block (state words 12..15)
// NOTE(review): presumably the C prototype is (out, in, len, key, counter) in
// that order - confirm against the declaring header.
//
// Dispatch: when the length is at least 192 bytes and the ARMV7_NEON bit is
// set in OPENSSL_armcap_P, control is transferred to ChaCha20_neon with the
// argument registers untouched. Otherwise the scalar loop below is used.
//
// Register roles in the scalar loop:
//   x22,x23         packed sigma words 0..3   (input block, kept for the final add)
//   x24..x27        packed key words 4..11
//   x28,x30         packed counter words 12..15 (x30 is reused as data here and
//                   reloaded from the frame in the epilogue)
//   w5..w8          working words 0..3
//   w9..w12         working words 4..7
//   w13..w16        working words 8..11
//   w17,w19,w20,w21 working words 12..15
//   x4              round-loop counter (the counter pointer is dead after the load)
//
// Stack: a 96-byte frame saves x29,x30 and x19..x28; a further 64 bytes at sp
// serve as scratch for the keystream of a final partial block and are zeroed
// before returning from the tail path.
//-----------------------------------------------------------------------------
.globl  ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type   ChaCha20_ctr32,%function
.align  5
ChaCha20_ctr32:
        // Nothing to do for a zero length.
        cbz     x2,.Labort
        adr     x5,.LOPENSSL_armcap_P
        // Inputs shorter than 192 bytes always take the scalar path.
        cmp     x2,#192
        b.lo    .Lshort
        // x6 = stored offset to OPENSSL_armcap_P, x5 = address it is relative to.
#ifdef  __ILP32__
        ldrsw   x6,[x5]
#else
        ldr     x6,[x5]
#endif
        ldr     w17,[x6,x5]
        tst     w17,#ARMV7_NEON
        b.ne    ChaCha20_neon

.Lshort:
        // Build the frame and save every callee-saved register used below.
        stp     x29,x30,[sp,#-96]!
        add     x29,sp,#0

        adr     x5,.Lsigma
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        // 64-byte scratch area for the keystream of a trailing partial block.
        sub     sp,sp,#64

        ldp     x22,x23,[x5]            // load sigma
        ldp     x24,x25,[x3]            // load key
        ldp     x26,x27,[x3,#16]
        ldp     x28,x30,[x4]            // load counter
#ifdef  __ARMEB__
        // Big-endian: swap the 32-bit halves so the lower-numbered word of each
        // pair sits in the low half, as the unpack code below expects.
        ror     x24,x24,#32
        ror     x25,x25,#32
        ror     x26,x26,#32
        ror     x27,x27,#32
        ror     x28,x28,#32
        ror     x30,x30,#32
#endif

        // One iteration of the outer loop produces one 64-byte keystream block.
.Loop_outer:
        mov     w5,w22                  // unpack key block
        lsr     x6,x22,#32
        mov     w7,w23
        lsr     x8,x23,#32
        mov     w9,w24
        lsr     x10,x24,#32
        mov     w11,w25
        lsr     x12,x25,#32
        mov     w13,w26
        lsr     x14,x26,#32
        mov     w15,w27
        lsr     x16,x27,#32
        mov     w17,w28
        lsr     x19,x28,#32
        mov     w20,w30
        lsr     x21,x30,#32

        // 10 iterations, each holding a column pass and a diagonal pass.
        mov     x4,#10
        // The flags set here are consumed by the b.lo and b.hi after the round
        // loop; nothing in between writes the condition flags.
        subs    x2,x2,#64
.Loop:
        sub     x4,x4,#1
        // Column pass: quarter-rounds on (0,4,8,12) (1,5,9,13) (2,6,10,14)
        // (3,7,11,15), four at a time. ror by 16/20/24/25 is a left rotate by
        // 16/12/8/7.
        add     w5,w5,w9
        add     w6,w6,w10
        add     w7,w7,w11
        add     w8,w8,w12
        eor     w17,w17,w5
        eor     w19,w19,w6
        eor     w20,w20,w7
        eor     w21,w21,w8
        ror     w17,w17,#16
        ror     w19,w19,#16
        ror     w20,w20,#16
        ror     w21,w21,#16
        add     w13,w13,w17
        add     w14,w14,w19
        add     w15,w15,w20
        add     w16,w16,w21
        eor     w9,w9,w13
        eor     w10,w10,w14
        eor     w11,w11,w15
        eor     w12,w12,w16
        ror     w9,w9,#20
        ror     w10,w10,#20
        ror     w11,w11,#20
        ror     w12,w12,#20
        add     w5,w5,w9
        add     w6,w6,w10
        add     w7,w7,w11
        add     w8,w8,w12
        eor     w17,w17,w5
        eor     w19,w19,w6
        eor     w20,w20,w7
        eor     w21,w21,w8
        ror     w17,w17,#24
        ror     w19,w19,#24
        ror     w20,w20,#24
        ror     w21,w21,#24
        add     w13,w13,w17
        add     w14,w14,w19
        add     w15,w15,w20
        add     w16,w16,w21
        eor     w9,w9,w13
        eor     w10,w10,w14
        eor     w11,w11,w15
        eor     w12,w12,w16
        ror     w9,w9,#25
        ror     w10,w10,#25
        ror     w11,w11,#25
        ror     w12,w12,#25
        // Diagonal pass: quarter-rounds on (0,5,10,15) (1,6,11,12) (2,7,8,13)
        // (3,4,9,14), expressed by permuting which registers are combined.
        add     w5,w5,w10
        add     w6,w6,w11
        add     w7,w7,w12
        add     w8,w8,w9
        eor     w21,w21,w5
        eor     w17,w17,w6
        eor     w19,w19,w7
        eor     w20,w20,w8
        ror     w21,w21,#16
        ror     w17,w17,#16
        ror     w19,w19,#16
        ror     w20,w20,#16
        add     w15,w15,w21
        add     w16,w16,w17
        add     w13,w13,w19
        add     w14,w14,w20
        eor     w10,w10,w15
        eor     w11,w11,w16
        eor     w12,w12,w13
        eor     w9,w9,w14
        ror     w10,w10,#20
        ror     w11,w11,#20
        ror     w12,w12,#20
        ror     w9,w9,#20
        add     w5,w5,w10
        add     w6,w6,w11
        add     w7,w7,w12
        add     w8,w8,w9
        eor     w21,w21,w5
        eor     w17,w17,w6
        eor     w19,w19,w7
        eor     w20,w20,w8
        ror     w21,w21,#24
        ror     w17,w17,#24
        ror     w19,w19,#24
        ror     w20,w20,#24
        add     w15,w15,w21
        add     w16,w16,w17
        add     w13,w13,w19
        add     w14,w14,w20
        eor     w10,w10,w15
        eor     w11,w11,w16
        eor     w12,w12,w13
        eor     w9,w9,w14
        ror     w10,w10,#25
        ror     w11,w11,#25
        ror     w12,w12,#25
        ror     w9,w9,#25
        cbnz    x4,.Loop

        // Add the input block to the permuted state. The odd-numbered words use
        // 64-bit adds; any bits above bit 31 are shifted out by the lsl#32 in
        // the pack step that follows.
        add     w5,w5,w22               // accumulate key block
        add     x6,x6,x22,lsr#32
        add     w7,w7,w23
        add     x8,x8,x23,lsr#32
        add     w9,w9,w24
        add     x10,x10,x24,lsr#32
        add     w11,w11,w25
        add     x12,x12,x25,lsr#32
        add     w13,w13,w26
        add     x14,x14,x26,lsr#32
        add     w15,w15,w27
        add     x16,x16,x27,lsr#32
        add     w17,w17,w28
        add     x19,x19,x28,lsr#32
        add     w20,w20,w30
        add     x21,x21,x30,lsr#32

        // Fewer than 64 bytes were left before the subs above: partial block.
        b.lo    .Ltail

        add     x5,x5,x6,lsl#32 // pack
        add     x7,x7,x8,lsl#32
        ldp     x6,x8,[x1,#0]           // load input
        add     x9,x9,x10,lsl#32
        add     x11,x11,x12,lsl#32
        ldp     x10,x12,[x1,#16]
        add     x13,x13,x14,lsl#32
        add     x15,x15,x16,lsl#32
        ldp     x14,x16,[x1,#32]
        add     x17,x17,x19,lsl#32
        add     x20,x20,x21,lsl#32
        ldp     x19,x21,[x1,#48]
        add     x1,x1,#64
#ifdef  __ARMEB__
        rev     x5,x5
        rev     x7,x7
        rev     x9,x9
        rev     x11,x11
        rev     x13,x13
        rev     x15,x15
        rev     x17,x17
        rev     x20,x20
#endif
        // output = keystream XOR input
        eor     x5,x5,x6
        eor     x7,x7,x8
        eor     x9,x9,x10
        eor     x11,x11,x12
        eor     x13,x13,x14
        eor     x15,x15,x16
        eor     x17,x17,x19
        eor     x20,x20,x21

        stp     x5,x7,[x0,#0]           // store output
        // 64-bit add on the packed pair: word 12 is the low half of x28, so a
        // wrap of word 12 would carry into word 13.
        // NOTE(review): presumably callers never let the 32-bit block counter
        // wrap within one call - confirm.
        add     x28,x28,#1                      // increment counter
        stp     x9,x11,[x0,#16]
        stp     x13,x15,[x0,#32]
        stp     x17,x20,[x0,#48]
        add     x0,x0,#64

        // More input remains (flags still from the subs at the loop head).
        b.hi    .Loop_outer

        // Length was an exact multiple of 64: release scratch, restore, return.
        ldp     x19,x20,[x29,#16]
        add     sp,sp,#64
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#96
.Labort:
        ret

.align  4
.Ltail:
        // Undo the subs: x2 = number of trailing bytes (1..63).
        add     x2,x2,#64
        // Also entered from ChaCha20_neon when fewer than 64 bytes remain; the
        // scalar keystream block is still unpacked in w5..w21 at that point.
.Less_than_64:
        // Point x1/x0/x4 at the end of input/output/scratch and count x2 up
        // from -len to 0. x0 carries a bias of -1 because x2 is incremented
        // before the store in the loop below.
        sub     x0,x0,#1
        add     x1,x1,x2
        add     x0,x0,x2
        add     x4,sp,x2
        neg     x2,x2

        add     x5,x5,x6,lsl#32 // pack
        add     x7,x7,x8,lsl#32
        add     x9,x9,x10,lsl#32
        add     x11,x11,x12,lsl#32
        add     x13,x13,x14,lsl#32
        add     x15,x15,x16,lsl#32
        add     x17,x17,x19,lsl#32
        add     x20,x20,x21,lsl#32
#ifdef  __ARMEB__
        rev     x5,x5
        rev     x7,x7
        rev     x9,x9
        rev     x11,x11
        rev     x13,x13
        rev     x15,x15
        rev     x17,x17
        rev     x20,x20
#endif
        // Spill the keystream block to the stack scratch for byte-wise access.
        stp     x5,x7,[sp,#0]
        stp     x9,x11,[sp,#16]
        stp     x13,x15,[sp,#32]
        stp     x17,x20,[sp,#48]

.Loop_tail:
        ldrb    w10,[x1,x2]
        ldrb    w11,[x4,x2]
        add     x2,x2,#1
        eor     w10,w10,w11
        strb    w10,[x0,x2]
        cbnz    x2,.Loop_tail

        // Wipe the keystream from the stack scratch before returning.
        stp     xzr,xzr,[sp,#0]
        stp     xzr,xzr,[sp,#16]
        stp     xzr,xzr,[sp,#32]
        stp     xzr,xzr,[sp,#48]

        ldp     x19,x20,[x29,#16]
        add     sp,sp,#64
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#96
        ret
.size   ChaCha20_ctr32,.-ChaCha20_ctr32

//-----------------------------------------------------------------------------
// ChaCha20_neon - file-local code path (no .globl) reached by a branch from
// ChaCha20_ctr32 when the length is at least 192 bytes and the NEON capability
// bit is set. The argument registers are as on entry to ChaCha20_ctr32:
//   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
//   x3 = key pointer (32 bytes), x4 = counter block pointer (16 bytes).
//
// Lengths of 512 bytes or more are handed to .L512_or_more_neon once the
// common frame has been built. Below that, every outer iteration produces
// 256 bytes of keystream: one 64-byte block in general-purpose registers
// (same register roles as the scalar loop of ChaCha20_ctr32) interleaved with
// three blocks held one row per vector register:
//   block A: v0..v3     block B: v4..v7     block C: v16..v19
//   v20..v23  temporaries / input data
//   v24       sigma row, v25,v26 key rows (loop invariant)
//   v27,v28,v29  counter rows for blocks A,B,C (scalar counter +1,+2,+3)
//   v31       counter step: {1,0,0,0} at load, {4,0,0,0} after the shl
// Only v0..v7 and v16..v31 are touched in this routine.
//
// Stack layout matches ChaCha20_ctr32 (96-byte frame addressed through x29,
// 64 bytes of scratch at sp), which is what allows the jump into the
// .Less_than_64 tail of that function.
//-----------------------------------------------------------------------------
.type   ChaCha20_neon,%function
.align  5
ChaCha20_neon:
        stp     x29,x30,[sp,#-96]!
        add     x29,sp,#0

        adr     x5,.Lsigma
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        // 512 bytes or more: continue in the wider code path, frame already set.
        cmp     x2,#512
        b.hs    .L512_or_more_neon

        // 64-byte scratch area for the keystream of a trailing partial block.
        sub     sp,sp,#64

        // Each input is loaded twice: packed into x registers for the scalar
        // block and as a vector row for the NEON blocks.
        ldp     x22,x23,[x5]            // load sigma
        // Post-increment leaves x5 pointing at .Lone.
        ld1     {v24.4s},[x5],#16
        ldp     x24,x25,[x3]            // load key
        ldp     x26,x27,[x3,#16]
        ld1     {v25.4s,v26.4s},[x3]
        ldp     x28,x30,[x4]            // load counter
        ld1     {v27.4s},[x4]
        ld1     {v31.4s},[x5]
#ifdef  __ARMEB__
        rev64   v24.4s,v24.4s
        ror     x24,x24,#32
        ror     x25,x25,#32
        ror     x26,x26,#32
        ror     x27,x27,#32
        ror     x28,x28,#32
        ror     x30,x30,#32
#endif
        // The scalar block uses the caller's counter; the vector blocks use the
        // next three counter values.
        add     v27.4s,v27.4s,v31.4s            // += 1
        add     v28.4s,v27.4s,v31.4s
        add     v29.4s,v28.4s,v31.4s
        shl     v31.4s,v31.4s,#2                        // 1 -> 4

.Loop_outer_neon:
        // Initialise the scalar working state and the three vector blocks from
        // the saved input rows.
        mov     w5,w22                  // unpack key block
        lsr     x6,x22,#32
        mov     v0.16b,v24.16b
        mov     w7,w23
        lsr     x8,x23,#32
        mov     v4.16b,v24.16b
        mov     w9,w24
        lsr     x10,x24,#32
        mov     v16.16b,v24.16b
        mov     w11,w25
        mov     v1.16b,v25.16b
        lsr     x12,x25,#32
        mov     v5.16b,v25.16b
        mov     w13,w26
        mov     v17.16b,v25.16b
        lsr     x14,x26,#32
        mov     v3.16b,v27.16b
        mov     w15,w27
        mov     v7.16b,v28.16b
        lsr     x16,x27,#32
        mov     v19.16b,v29.16b
        mov     w17,w28
        mov     v2.16b,v26.16b
        lsr     x19,x28,#32
        mov     v6.16b,v26.16b
        mov     w20,w30
        mov     v18.16b,v26.16b
        lsr     x21,x30,#32

        // 10 iterations, each holding a column pass and a diagonal pass.
        mov     x4,#10
        // Flags set here are consumed by the b.lo and b.hi after the round
        // loop; nothing in between writes the condition flags.
        subs    x2,x2,#256
.Loop_neon:
        sub     x4,x4,#1
        // Column pass. Vector rotates: rev32 on 16-bit elements is a rotate by
        // 16; each ushr/sli pair (20/12, 24/8, 25/7) is a left rotate by
        // 12, 8 and 7. Scalar rotates use ror by 16/20/24/25.
        add     v0.4s,v0.4s,v1.4s
        add     w5,w5,w9
        add     v4.4s,v4.4s,v5.4s
        add     w6,w6,w10
        add     v16.4s,v16.4s,v17.4s
        add     w7,w7,w11
        eor     v3.16b,v3.16b,v0.16b
        add     w8,w8,w12
        eor     v7.16b,v7.16b,v4.16b
        eor     w17,w17,w5
        eor     v19.16b,v19.16b,v16.16b
        eor     w19,w19,w6
        rev32   v3.8h,v3.8h
        eor     w20,w20,w7
        rev32   v7.8h,v7.8h
        eor     w21,w21,w8
        rev32   v19.8h,v19.8h
        ror     w17,w17,#16
        add     v2.4s,v2.4s,v3.4s
        ror     w19,w19,#16
        add     v6.4s,v6.4s,v7.4s
        ror     w20,w20,#16
        add     v18.4s,v18.4s,v19.4s
        ror     w21,w21,#16
        eor     v20.16b,v1.16b,v2.16b
        add     w13,w13,w17
        eor     v21.16b,v5.16b,v6.16b
        add     w14,w14,w19
        eor     v22.16b,v17.16b,v18.16b
        add     w15,w15,w20
        ushr    v1.4s,v20.4s,#20
        add     w16,w16,w21
        ushr    v5.4s,v21.4s,#20
        eor     w9,w9,w13
        ushr    v17.4s,v22.4s,#20
        eor     w10,w10,w14
        sli     v1.4s,v20.4s,#12
        eor     w11,w11,w15
        sli     v5.4s,v21.4s,#12
        eor     w12,w12,w16
        sli     v17.4s,v22.4s,#12
        ror     w9,w9,#20
        add     v0.4s,v0.4s,v1.4s
        ror     w10,w10,#20
        add     v4.4s,v4.4s,v5.4s
        ror     w11,w11,#20
        add     v16.4s,v16.4s,v17.4s
        ror     w12,w12,#20
        eor     v20.16b,v3.16b,v0.16b
        add     w5,w5,w9
        eor     v21.16b,v7.16b,v4.16b
        add     w6,w6,w10
        eor     v22.16b,v19.16b,v16.16b
        add     w7,w7,w11
        ushr    v3.4s,v20.4s,#24
        add     w8,w8,w12
        ushr    v7.4s,v21.4s,#24
        eor     w17,w17,w5
        ushr    v19.4s,v22.4s,#24
        eor     w19,w19,w6
        sli     v3.4s,v20.4s,#8
        eor     w20,w20,w7
        sli     v7.4s,v21.4s,#8
        eor     w21,w21,w8
        sli     v19.4s,v22.4s,#8
        ror     w17,w17,#24
        add     v2.4s,v2.4s,v3.4s
        ror     w19,w19,#24
        add     v6.4s,v6.4s,v7.4s
        ror     w20,w20,#24
        add     v18.4s,v18.4s,v19.4s
        ror     w21,w21,#24
        eor     v20.16b,v1.16b,v2.16b
        add     w13,w13,w17
        eor     v21.16b,v5.16b,v6.16b
        add     w14,w14,w19
        eor     v22.16b,v17.16b,v18.16b
        add     w15,w15,w20
        ushr    v1.4s,v20.4s,#25
        add     w16,w16,w21
        ushr    v5.4s,v21.4s,#25
        eor     w9,w9,w13
        ushr    v17.4s,v22.4s,#25
        eor     w10,w10,w14
        sli     v1.4s,v20.4s,#7
        eor     w11,w11,w15
        sli     v5.4s,v21.4s,#7
        eor     w12,w12,w16
        sli     v17.4s,v22.4s,#7
        ror     w9,w9,#25
        // Rotate rows 1..3 of each vector block by 4/8/12 bytes so the
        // diagonals line up in columns for the second pass.
        ext     v2.16b,v2.16b,v2.16b,#8
        ror     w10,w10,#25
        ext     v6.16b,v6.16b,v6.16b,#8
        ror     w11,w11,#25
        ext     v18.16b,v18.16b,v18.16b,#8
        ror     w12,w12,#25
        ext     v3.16b,v3.16b,v3.16b,#12
        ext     v7.16b,v7.16b,v7.16b,#12
        ext     v19.16b,v19.16b,v19.16b,#12
        ext     v1.16b,v1.16b,v1.16b,#4
        ext     v5.16b,v5.16b,v5.16b,#4
        ext     v17.16b,v17.16b,v17.16b,#4
        // Diagonal pass (the scalar side permutes register operands instead).
        add     v0.4s,v0.4s,v1.4s
        add     w5,w5,w10
        add     v4.4s,v4.4s,v5.4s
        add     w6,w6,w11
        add     v16.4s,v16.4s,v17.4s
        add     w7,w7,w12
        eor     v3.16b,v3.16b,v0.16b
        add     w8,w8,w9
        eor     v7.16b,v7.16b,v4.16b
        eor     w21,w21,w5
        eor     v19.16b,v19.16b,v16.16b
        eor     w17,w17,w6
        rev32   v3.8h,v3.8h
        eor     w19,w19,w7
        rev32   v7.8h,v7.8h
        eor     w20,w20,w8
        rev32   v19.8h,v19.8h
        ror     w21,w21,#16
        add     v2.4s,v2.4s,v3.4s
        ror     w17,w17,#16
        add     v6.4s,v6.4s,v7.4s
        ror     w19,w19,#16
        add     v18.4s,v18.4s,v19.4s
        ror     w20,w20,#16
        eor     v20.16b,v1.16b,v2.16b
        add     w15,w15,w21
        eor     v21.16b,v5.16b,v6.16b
        add     w16,w16,w17
        eor     v22.16b,v17.16b,v18.16b
        add     w13,w13,w19
        ushr    v1.4s,v20.4s,#20
        add     w14,w14,w20
        ushr    v5.4s,v21.4s,#20
        eor     w10,w10,w15
        ushr    v17.4s,v22.4s,#20
        eor     w11,w11,w16
        sli     v1.4s,v20.4s,#12
        eor     w12,w12,w13
        sli     v5.4s,v21.4s,#12
        eor     w9,w9,w14
        sli     v17.4s,v22.4s,#12
        ror     w10,w10,#20
        add     v0.4s,v0.4s,v1.4s
        ror     w11,w11,#20
        add     v4.4s,v4.4s,v5.4s
        ror     w12,w12,#20
        add     v16.4s,v16.4s,v17.4s
        ror     w9,w9,#20
        eor     v20.16b,v3.16b,v0.16b
        add     w5,w5,w10
        eor     v21.16b,v7.16b,v4.16b
        add     w6,w6,w11
        eor     v22.16b,v19.16b,v16.16b
        add     w7,w7,w12
        ushr    v3.4s,v20.4s,#24
        add     w8,w8,w9
        ushr    v7.4s,v21.4s,#24
        eor     w21,w21,w5
        ushr    v19.4s,v22.4s,#24
        eor     w17,w17,w6
        sli     v3.4s,v20.4s,#8
        eor     w19,w19,w7
        sli     v7.4s,v21.4s,#8
        eor     w20,w20,w8
        sli     v19.4s,v22.4s,#8
        ror     w21,w21,#24
        add     v2.4s,v2.4s,v3.4s
        ror     w17,w17,#24
        add     v6.4s,v6.4s,v7.4s
        ror     w19,w19,#24
        add     v18.4s,v18.4s,v19.4s
        ror     w20,w20,#24
        eor     v20.16b,v1.16b,v2.16b
        add     w15,w15,w21
        eor     v21.16b,v5.16b,v6.16b
        add     w16,w16,w17
        eor     v22.16b,v17.16b,v18.16b
        add     w13,w13,w19
        ushr    v1.4s,v20.4s,#25
        add     w14,w14,w20
        ushr    v5.4s,v21.4s,#25
        eor     w10,w10,w15
        ushr    v17.4s,v22.4s,#25
        eor     w11,w11,w16
        sli     v1.4s,v20.4s,#7
        eor     w12,w12,w13
        sli     v5.4s,v21.4s,#7
        eor     w9,w9,w14
        sli     v17.4s,v22.4s,#7
        ror     w10,w10,#25
        // Rotate the rows back (inverse byte offsets) to column order.
        ext     v2.16b,v2.16b,v2.16b,#8
        ror     w11,w11,#25
        ext     v6.16b,v6.16b,v6.16b,#8
        ror     w12,w12,#25
        ext     v18.16b,v18.16b,v18.16b,#8
        ror     w9,w9,#25
        ext     v3.16b,v3.16b,v3.16b,#4
        ext     v7.16b,v7.16b,v7.16b,#4
        ext     v19.16b,v19.16b,v19.16b,#4
        ext     v1.16b,v1.16b,v1.16b,#12
        ext     v5.16b,v5.16b,v5.16b,#12
        ext     v17.16b,v17.16b,v17.16b,#12
        cbnz    x4,.Loop_neon

        // Add the input rows back in: sigma (v24), key (v25,v26) and the
        // per-block counter rows (v27,v28,v29); the scalar block adds its
        // packed inputs as in ChaCha20_ctr32.
        add     w5,w5,w22               // accumulate key block
        add     v0.4s,v0.4s,v24.4s
        add     x6,x6,x22,lsr#32
        add     v4.4s,v4.4s,v24.4s
        add     w7,w7,w23
        add     v16.4s,v16.4s,v24.4s
        add     x8,x8,x23,lsr#32
        add     v2.4s,v2.4s,v26.4s
        add     w9,w9,w24
        add     v6.4s,v6.4s,v26.4s
        add     x10,x10,x24,lsr#32
        add     v18.4s,v18.4s,v26.4s
        add     w11,w11,w25
        add     v3.4s,v3.4s,v27.4s
        add     x12,x12,x25,lsr#32
        add     w13,w13,w26
        add     v7.4s,v7.4s,v28.4s
        add     x14,x14,x26,lsr#32
        add     w15,w15,w27
        add     v19.4s,v19.4s,v29.4s
        add     x16,x16,x27,lsr#32
        add     w17,w17,w28
        add     v1.4s,v1.4s,v25.4s
        add     x19,x19,x28,lsr#32
        add     w20,w20,w30
        add     v5.4s,v5.4s,v25.4s
        add     x21,x21,x30,lsr#32
        add     v17.4s,v17.4s,v25.4s

        // Fewer than 256 bytes were left before the subs above.
        b.lo    .Ltail_neon

        // Full 256-byte chunk: scalar block covers bytes 0..63, vector blocks
        // A, B, C cover bytes 64..255.
        add     x5,x5,x6,lsl#32 // pack
        add     x7,x7,x8,lsl#32
        ldp     x6,x8,[x1,#0]           // load input
        add     x9,x9,x10,lsl#32
        add     x11,x11,x12,lsl#32
        ldp     x10,x12,[x1,#16]
        add     x13,x13,x14,lsl#32
        add     x15,x15,x16,lsl#32
        ldp     x14,x16,[x1,#32]
        add     x17,x17,x19,lsl#32
        add     x20,x20,x21,lsl#32
        ldp     x19,x21,[x1,#48]
        add     x1,x1,#64
#ifdef  __ARMEB__
        rev     x5,x5
        rev     x7,x7
        rev     x9,x9
        rev     x11,x11
        rev     x13,x13
        rev     x15,x15
        rev     x17,x17
        rev     x20,x20
#endif
        ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
        eor     x5,x5,x6
        eor     x7,x7,x8
        eor     x9,x9,x10
        eor     x11,x11,x12
        eor     x13,x13,x14
        eor     v0.16b,v0.16b,v20.16b
        eor     x15,x15,x16
        eor     v1.16b,v1.16b,v21.16b
        eor     x17,x17,x19
        eor     v2.16b,v2.16b,v22.16b
        eor     x20,x20,x21
        eor     v3.16b,v3.16b,v23.16b
        ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

        stp     x5,x7,[x0,#0]           // store output
        // Scalar counter: 64-bit add on the packed pair (word 12 in the low
        // half). Vector counters: lane-wise 32-bit adds of {4,0,0,0}.
        add     x28,x28,#4                      // increment counter
        stp     x9,x11,[x0,#16]
        add     v27.4s,v27.4s,v31.4s            // += 4
        stp     x13,x15,[x0,#32]
        add     v28.4s,v28.4s,v31.4s
        stp     x17,x20,[x0,#48]
        add     v29.4s,v29.4s,v31.4s
        add     x0,x0,#64

        st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
        // v0..v3 are free again and are reused for the last 64 input bytes.
        ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

        eor     v4.16b,v4.16b,v20.16b
        eor     v5.16b,v5.16b,v21.16b
        eor     v6.16b,v6.16b,v22.16b
        eor     v7.16b,v7.16b,v23.16b
        st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

        eor     v16.16b,v16.16b,v0.16b
        eor     v17.16b,v17.16b,v1.16b
        eor     v18.16b,v18.16b,v2.16b
        eor     v19.16b,v19.16b,v3.16b
        st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

        // More input remains (flags still from the subs at the loop head).
        b.hi    .Loop_outer_neon

        // Length was an exact multiple of 256: release scratch, restore, return.
        ldp     x19,x20,[x29,#16]
        add     sp,sp,#64
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#96
        ret

.Ltail_neon:
        // Undo the subs: x2 = number of trailing bytes (1..255).
        add     x2,x2,#256
        cmp     x2,#64
        // Under 64 bytes: reuse the byte-wise tail inside ChaCha20_ctr32, which
        // works on the scalar block still held in w5..w21.
        b.lo    .Less_than_64

        // At least 64 bytes: consume the scalar block for the first 64 bytes.
        add     x5,x5,x6,lsl#32 // pack
        add     x7,x7,x8,lsl#32
        ldp     x6,x8,[x1,#0]           // load input
        add     x9,x9,x10,lsl#32
        add     x11,x11,x12,lsl#32
        ldp     x10,x12,[x1,#16]
        add     x13,x13,x14,lsl#32
        add     x15,x15,x16,lsl#32
        ldp     x14,x16,[x1,#32]
        add     x17,x17,x19,lsl#32
        add     x20,x20,x21,lsl#32
        ldp     x19,x21,[x1,#48]
        add     x1,x1,#64
#ifdef  __ARMEB__
        rev     x5,x5
        rev     x7,x7
        rev     x9,x9
        rev     x11,x11
        rev     x13,x13
        rev     x15,x15
        rev     x17,x17
        rev     x20,x20
#endif
        eor     x5,x5,x6
        eor     x7,x7,x8
        eor     x9,x9,x10
        eor     x11,x11,x12
        eor     x13,x13,x14
        eor     x15,x15,x16
        eor     x17,x17,x19
        eor     x20,x20,x21

        stp     x5,x7,[x0,#0]           // store output
        add     x28,x28,#4                      // increment counter
        stp     x9,x11,[x0,#16]
        stp     x13,x15,[x0,#32]
        stp     x17,x20,[x0,#48]
        add     x0,x0,#64
        // Flags are still those of the preceding cmp x2,#64: equal means
        // exactly 64 bytes were left and everything has been written.
        b.eq    .Ldone_neon
        sub     x2,x2,#64
        cmp     x2,#64
        // 1..63 bytes left: block A supplies the partial keystream.
        b.lo    .Less_than_128

        ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
        eor     v0.16b,v0.16b,v20.16b
        eor     v1.16b,v1.16b,v21.16b
        eor     v2.16b,v2.16b,v22.16b
        eor     v3.16b,v3.16b,v23.16b
        st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
        b.eq    .Ldone_neon
        sub     x2,x2,#64
        cmp     x2,#64
        // 1..63 bytes left: block B supplies the partial keystream.
        b.lo    .Less_than_192

        ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
        eor     v4.16b,v4.16b,v20.16b
        eor     v5.16b,v5.16b,v21.16b
        eor     v6.16b,v6.16b,v22.16b
        eor     v7.16b,v7.16b,v23.16b
        st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
        b.eq    .Ldone_neon
        sub     x2,x2,#64

        // 1..63 bytes left: block C supplies the partial keystream.
        st1     {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
        b       .Last_neon

.Less_than_128:
        st1     {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
        b       .Last_neon
.Less_than_192:
        st1     {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
        b       .Last_neon

.align  4
.Last_neon:
        // Same end-relative indexing as the scalar tail: pointers are moved to
        // the end of the data, x2 counts up from -len to 0, and x0 carries a
        // bias of -1 because x2 is incremented before the store.
        sub     x0,x0,#1
        add     x1,x1,x2
        add     x0,x0,x2
        add     x4,sp,x2
        neg     x2,x2

.Loop_tail_neon:
        ldrb    w10,[x1,x2]
        ldrb    w11,[x4,x2]
        add     x2,x2,#1
        eor     w10,w10,w11
        strb    w10,[x0,x2]
        cbnz    x2,.Loop_tail_neon

        // Wipe the keystream from the stack scratch before returning.
        stp     xzr,xzr,[sp,#0]
        stp     xzr,xzr,[sp,#16]
        stp     xzr,xzr,[sp,#32]
        stp     xzr,xzr,[sp,#48]

.Ldone_neon:
        ldp     x19,x20,[x29,#16]
        add     sp,sp,#64
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#96
        ret
.size   ChaCha20_neon,.-ChaCha20_neon
.type   ChaCha20_512_neon,%function
.align  5
ChaCha20_512_neon:
        stp     x29,x30,[sp,#-96]!
        add     x29,sp,#0

        adr     x5,.Lsigma
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]

.L512_or_more_neon:
        sub     sp,sp,#128+64

        ldp     x22,x23,[x5]            // load sigma
        ld1     {v24.4s},[x5],#16
        ldp     x24,x25,[x3]            // load key
        ldp     x26,x27,[x3,#16]
        ld1     {v25.4s,v26.4s},[x3]
        ldp     x28,x30,[x4]            // load counter
        ld1     {v27.4s},[x4]
        ld1     {v31.4s},[x5]
#ifdef  __ARMEB__
        rev64   v24.4s,v24.4s
        ror     x24,x24,#32
        ror     x25,x25,#32
        ror     x26,x26,#32
        ror     x27,x27,#32
        ror     x28,x28,#32
        ror     x30,x30,#32
#endif
        add     v27.4s,v27.4s,v31.4s            // += 1
        stp     q24,q25,[sp,#0]         // off-load key block, invariant part
        add     v27.4s,v27.4s,v31.4s            // not typo
        str     q26,[sp,#32]
        add     v28.4s,v27.4s,v31.4s
        add     v29.4s,v28.4s,v31.4s
        add     v30.4s,v29.4s,v31.4s
        shl     v31.4s,v31.4s,#2                        // 1 -> 4

        stp     d8,d9,[sp,#128+0]               // meet ABI requirements
        stp     d10,d11,[sp,#128+16]
        stp     d12,d13,[sp,#128+32]
        stp     d14,d15,[sp,#128+48]

        sub     x2,x2,#512                      // not typo

.Loop_outer_512_neon:
        mov     v0.16b,v24.16b
        mov     v4.16b,v24.16b
        mov     v8.16b,v24.16b
        mov     v12.16b,v24.16b
        mov     v16.16b,v24.16b
        mov     v20.16b,v24.16b
        mov     v1.16b,v25.16b
        mov     w5,w22                  // unpack key block
        mov     v5.16b,v25.16b
        lsr     x6,x22,#32
        mov     v9.16b,v25.16b
        mov     w7,w23
        mov     v13.16b,v25.16b
        lsr     x8,x23,#32
        mov     v17.16b,v25.16b
        mov     w9,w24
        mov     v21.16b,v25.16b
        lsr     x10,x24,#32
        mov     v3.16b,v27.16b
        mov     w11,w25
        mov     v7.16b,v28.16b
        lsr     x12,x25,#32
        mov     v11.16b,v29.16b
        mov     w13,w26
        mov     v15.16b,v30.16b
        lsr     x14,x26,#32
        mov     v2.16b,v26.16b
        mov     w15,w27
        mov     v6.16b,v26.16b
        lsr     x16,x27,#32
        add     v19.4s,v3.4s,v31.4s                     // +4
        mov     w17,w28
        add     v23.4s,v7.4s,v31.4s                     // +4
        lsr     x19,x28,#32
        mov     v10.16b,v26.16b
        mov     w20,w30
        mov     v14.16b,v26.16b
        lsr     x21,x30,#32
        mov     v18.16b,v26.16b
        stp     q27,q28,[sp,#48]                // off-load key block, variable part
        mov     v22.16b,v26.16b
        str     q29,[sp,#80]

        mov     x4,#5
        subs    x2,x2,#512
.Loop_upper_neon:
        sub     x4,x4,#1
        add     v0.4s,v0.4s,v1.4s
        add     w5,w5,w9
        add     v4.4s,v4.4s,v5.4s
        add     w6,w6,w10
        add     v8.4s,v8.4s,v9.4s
        add     w7,w7,w11
        add     v12.4s,v12.4s,v13.4s
        add     w8,w8,w12
        add     v16.4s,v16.4s,v17.4s
        eor     w17,w17,w5
        add     v20.4s,v20.4s,v21.4s
        eor     w19,w19,w6
        eor     v3.16b,v3.16b,v0.16b
        eor     w20,w20,w7
        eor     v7.16b,v7.16b,v4.16b
        eor     w21,w21,w8
        eor     v11.16b,v11.16b,v8.16b
        ror     w17,w17,#16
        eor     v15.16b,v15.16b,v12.16b
        ror     w19,w19,#16
        eor     v19.16b,v19.16b,v16.16b
        ror     w20,w20,#16
        eor     v23.16b,v23.16b,v20.16b
        ror     w21,w21,#16
        rev32   v3.8h,v3.8h
        add     w13,w13,w17
        rev32   v7.8h,v7.8h
        add     w14,w14,w19
        rev32   v11.8h,v11.8h
        add     w15,w15,w20
        rev32   v15.8h,v15.8h
        add     w16,w16,w21
        rev32   v19.8h,v19.8h
        eor     w9,w9,w13
        rev32   v23.8h,v23.8h
        eor     w10,w10,w14
        add     v2.4s,v2.4s,v3.4s
        eor     w11,w11,w15
        add     v6.4s,v6.4s,v7.4s
        eor     w12,w12,w16
        add     v10.4s,v10.4s,v11.4s
        ror     w9,w9,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w10,w10,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w11,w11,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w12,w12,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w9
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w10
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w11
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w12
        eor     v28.16b,v17.16b,v18.16b
        eor     w17,w17,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w19,w19,w6
        ushr    v1.4s,v24.4s,#20
        eor     w20,w20,w7
        ushr    v5.4s,v25.4s,#20
        eor     w21,w21,w8
        ushr    v9.4s,v26.4s,#20
        ror     w17,w17,#24
        ushr    v13.4s,v27.4s,#20
        ror     w19,w19,#24
        ushr    v17.4s,v28.4s,#20
        ror     w20,w20,#24
        ushr    v21.4s,v29.4s,#20
        ror     w21,w21,#24
        sli     v1.4s,v24.4s,#12
        add     w13,w13,w17
        sli     v5.4s,v25.4s,#12
        add     w14,w14,w19
        sli     v9.4s,v26.4s,#12
        add     w15,w15,w20
        sli     v13.4s,v27.4s,#12
        add     w16,w16,w21
        sli     v17.4s,v28.4s,#12
        eor     w9,w9,w13
        sli     v21.4s,v29.4s,#12
        eor     w10,w10,w14
        add     v0.4s,v0.4s,v1.4s
        eor     w11,w11,w15
        add     v4.4s,v4.4s,v5.4s
        eor     w12,w12,w16
        add     v8.4s,v8.4s,v9.4s
        ror     w9,w9,#25
        add     v12.4s,v12.4s,v13.4s
        ror     w10,w10,#25
        add     v16.4s,v16.4s,v17.4s
        ror     w11,w11,#25
        add     v20.4s,v20.4s,v21.4s
        ror     w12,w12,#25
        eor     v24.16b,v3.16b,v0.16b
        add     w5,w5,w10
        eor     v25.16b,v7.16b,v4.16b
        add     w6,w6,w11
        eor     v26.16b,v11.16b,v8.16b
        add     w7,w7,w12
        eor     v27.16b,v15.16b,v12.16b
        add     w8,w8,w9
        eor     v28.16b,v19.16b,v16.16b
        eor     w21,w21,w5
        eor     v29.16b,v23.16b,v20.16b
        eor     w17,w17,w6
        ushr    v3.4s,v24.4s,#24
        eor     w19,w19,w7
        ushr    v7.4s,v25.4s,#24
        eor     w20,w20,w8
        ushr    v11.4s,v26.4s,#24
        ror     w21,w21,#16
        ushr    v15.4s,v27.4s,#24
        ror     w17,w17,#16
        ushr    v19.4s,v28.4s,#24
        ror     w19,w19,#16
        ushr    v23.4s,v29.4s,#24
        ror     w20,w20,#16
        sli     v3.4s,v24.4s,#8
        add     w15,w15,w21
        sli     v7.4s,v25.4s,#8
        add     w16,w16,w17
        sli     v11.4s,v26.4s,#8
        add     w13,w13,w19
        sli     v15.4s,v27.4s,#8
        add     w14,w14,w20
        sli     v19.4s,v28.4s,#8
        eor     w10,w10,w15
        sli     v23.4s,v29.4s,#8
        eor     w11,w11,w16
        add     v2.4s,v2.4s,v3.4s
        eor     w12,w12,w13
        add     v6.4s,v6.4s,v7.4s
        eor     w9,w9,w14
        add     v10.4s,v10.4s,v11.4s
        ror     w10,w10,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w11,w11,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w12,w12,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w9,w9,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w10
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w11
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w12
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w9
        eor     v28.16b,v17.16b,v18.16b
        eor     w21,w21,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w17,w17,w6
        ushr    v1.4s,v24.4s,#25
        eor     w19,w19,w7
        ushr    v5.4s,v25.4s,#25
        eor     w20,w20,w8
        ushr    v9.4s,v26.4s,#25
        ror     w21,w21,#24
        ushr    v13.4s,v27.4s,#25
        ror     w17,w17,#24
        ushr    v17.4s,v28.4s,#25
        ror     w19,w19,#24
        ushr    v21.4s,v29.4s,#25
        ror     w20,w20,#24
        sli     v1.4s,v24.4s,#7
        add     w15,w15,w21
        sli     v5.4s,v25.4s,#7
        add     w16,w16,w17
        sli     v9.4s,v26.4s,#7
        add     w13,w13,w19
        sli     v13.4s,v27.4s,#7
        add     w14,w14,w20
        sli     v17.4s,v28.4s,#7
        eor     w10,w10,w15
        sli     v21.4s,v29.4s,#7
        eor     w11,w11,w16
        ext     v2.16b,v2.16b,v2.16b,#8
        eor     w12,w12,w13
        ext     v6.16b,v6.16b,v6.16b,#8
        eor     w9,w9,w14
        ext     v10.16b,v10.16b,v10.16b,#8
        ror     w10,w10,#25
        ext     v14.16b,v14.16b,v14.16b,#8
        ror     w11,w11,#25
        ext     v18.16b,v18.16b,v18.16b,#8
        ror     w12,w12,#25
        ext     v22.16b,v22.16b,v22.16b,#8
        ror     w9,w9,#25
        ext     v3.16b,v3.16b,v3.16b,#12
        ext     v7.16b,v7.16b,v7.16b,#12
        ext     v11.16b,v11.16b,v11.16b,#12
        ext     v15.16b,v15.16b,v15.16b,#12
        ext     v19.16b,v19.16b,v19.16b,#12
        ext     v23.16b,v23.16b,v23.16b,#12
        ext     v1.16b,v1.16b,v1.16b,#4
        ext     v5.16b,v5.16b,v5.16b,#4
        ext     v9.16b,v9.16b,v9.16b,#4
        ext     v13.16b,v13.16b,v13.16b,#4
        ext     v17.16b,v17.16b,v17.16b,#4
        ext     v21.16b,v21.16b,v21.16b,#4
        add     v0.4s,v0.4s,v1.4s
        add     w5,w5,w9
        add     v4.4s,v4.4s,v5.4s
        add     w6,w6,w10
        add     v8.4s,v8.4s,v9.4s
        add     w7,w7,w11
        add     v12.4s,v12.4s,v13.4s
        add     w8,w8,w12
        add     v16.4s,v16.4s,v17.4s
        eor     w17,w17,w5
        add     v20.4s,v20.4s,v21.4s
        eor     w19,w19,w6
        eor     v3.16b,v3.16b,v0.16b
        eor     w20,w20,w7
        eor     v7.16b,v7.16b,v4.16b
        eor     w21,w21,w8
        eor     v11.16b,v11.16b,v8.16b
        ror     w17,w17,#16
        eor     v15.16b,v15.16b,v12.16b
        ror     w19,w19,#16
        eor     v19.16b,v19.16b,v16.16b
        ror     w20,w20,#16
        eor     v23.16b,v23.16b,v20.16b
        ror     w21,w21,#16
        rev32   v3.8h,v3.8h
        add     w13,w13,w17
        rev32   v7.8h,v7.8h
        add     w14,w14,w19
        rev32   v11.8h,v11.8h
        add     w15,w15,w20
        rev32   v15.8h,v15.8h
        add     w16,w16,w21
        rev32   v19.8h,v19.8h
        eor     w9,w9,w13
        rev32   v23.8h,v23.8h
        eor     w10,w10,w14
        add     v2.4s,v2.4s,v3.4s
        eor     w11,w11,w15
        add     v6.4s,v6.4s,v7.4s
        eor     w12,w12,w16
        add     v10.4s,v10.4s,v11.4s
        ror     w9,w9,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w10,w10,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w11,w11,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w12,w12,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w9
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w10
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w11
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w12
        eor     v28.16b,v17.16b,v18.16b
        eor     w17,w17,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w19,w19,w6
        ushr    v1.4s,v24.4s,#20
        eor     w20,w20,w7
        ushr    v5.4s,v25.4s,#20
        eor     w21,w21,w8
        ushr    v9.4s,v26.4s,#20
        ror     w17,w17,#24
        ushr    v13.4s,v27.4s,#20
        ror     w19,w19,#24
        ushr    v17.4s,v28.4s,#20
        ror     w20,w20,#24
        ushr    v21.4s,v29.4s,#20
        ror     w21,w21,#24
        sli     v1.4s,v24.4s,#12
        add     w13,w13,w17
        sli     v5.4s,v25.4s,#12
        add     w14,w14,w19
        sli     v9.4s,v26.4s,#12
        add     w15,w15,w20
        sli     v13.4s,v27.4s,#12
        add     w16,w16,w21
        sli     v17.4s,v28.4s,#12
        eor     w9,w9,w13
        sli     v21.4s,v29.4s,#12
        eor     w10,w10,w14
        add     v0.4s,v0.4s,v1.4s
        eor     w11,w11,w15
        add     v4.4s,v4.4s,v5.4s
        eor     w12,w12,w16
        add     v8.4s,v8.4s,v9.4s
        ror     w9,w9,#25
        add     v12.4s,v12.4s,v13.4s
        ror     w10,w10,#25
        add     v16.4s,v16.4s,v17.4s
        ror     w11,w11,#25
        add     v20.4s,v20.4s,v21.4s
        ror     w12,w12,#25
        eor     v24.16b,v3.16b,v0.16b
        add     w5,w5,w10
        eor     v25.16b,v7.16b,v4.16b
        add     w6,w6,w11
        eor     v26.16b,v11.16b,v8.16b
        add     w7,w7,w12
        eor     v27.16b,v15.16b,v12.16b
        add     w8,w8,w9
        eor     v28.16b,v19.16b,v16.16b
        eor     w21,w21,w5
        eor     v29.16b,v23.16b,v20.16b
        eor     w17,w17,w6
        ushr    v3.4s,v24.4s,#24
        eor     w19,w19,w7
        ushr    v7.4s,v25.4s,#24
        eor     w20,w20,w8
        ushr    v11.4s,v26.4s,#24
        ror     w21,w21,#16
        ushr    v15.4s,v27.4s,#24
        ror     w17,w17,#16
        ushr    v19.4s,v28.4s,#24
        ror     w19,w19,#16
        ushr    v23.4s,v29.4s,#24
        ror     w20,w20,#16
        sli     v3.4s,v24.4s,#8
        add     w15,w15,w21
        sli     v7.4s,v25.4s,#8
        add     w16,w16,w17
        sli     v11.4s,v26.4s,#8
        add     w13,w13,w19
        sli     v15.4s,v27.4s,#8
        add     w14,w14,w20
        sli     v19.4s,v28.4s,#8
        eor     w10,w10,w15
        sli     v23.4s,v29.4s,#8
        eor     w11,w11,w16
        add     v2.4s,v2.4s,v3.4s
        eor     w12,w12,w13
        add     v6.4s,v6.4s,v7.4s
        eor     w9,w9,w14
        add     v10.4s,v10.4s,v11.4s
        ror     w10,w10,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w11,w11,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w12,w12,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w9,w9,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w10
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w11
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w12
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w9
        eor     v28.16b,v17.16b,v18.16b
        eor     w21,w21,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w17,w17,w6
        ushr    v1.4s,v24.4s,#25
        eor     w19,w19,w7
        ushr    v5.4s,v25.4s,#25
        eor     w20,w20,w8
        ushr    v9.4s,v26.4s,#25
        ror     w21,w21,#24
        ushr    v13.4s,v27.4s,#25
        ror     w17,w17,#24
        ushr    v17.4s,v28.4s,#25
        ror     w19,w19,#24
        ushr    v21.4s,v29.4s,#25
        ror     w20,w20,#24
        sli     v1.4s,v24.4s,#7
        add     w15,w15,w21
        sli     v5.4s,v25.4s,#7
        add     w16,w16,w17
        sli     v9.4s,v26.4s,#7
        add     w13,w13,w19
        sli     v13.4s,v27.4s,#7
        add     w14,w14,w20
        sli     v17.4s,v28.4s,#7
        eor     w10,w10,w15
        sli     v21.4s,v29.4s,#7
        eor     w11,w11,w16
        ext     v2.16b,v2.16b,v2.16b,#8
        eor     w12,w12,w13
        ext     v6.16b,v6.16b,v6.16b,#8
        eor     w9,w9,w14
        ext     v10.16b,v10.16b,v10.16b,#8
        ror     w10,w10,#25
        ext     v14.16b,v14.16b,v14.16b,#8
        ror     w11,w11,#25
        ext     v18.16b,v18.16b,v18.16b,#8
        ror     w12,w12,#25
        ext     v22.16b,v22.16b,v22.16b,#8
        ror     w9,w9,#25
        ext     v3.16b,v3.16b,v3.16b,#4
        ext     v7.16b,v7.16b,v7.16b,#4
        ext     v11.16b,v11.16b,v11.16b,#4
        ext     v15.16b,v15.16b,v15.16b,#4
        ext     v19.16b,v19.16b,v19.16b,#4
        ext     v23.16b,v23.16b,v23.16b,#4
        ext     v1.16b,v1.16b,v1.16b,#12
        ext     v5.16b,v5.16b,v5.16b,#12
        ext     v9.16b,v9.16b,v9.16b,#12
        ext     v13.16b,v13.16b,v13.16b,#12
        ext     v17.16b,v17.16b,v17.16b,#12
        ext     v21.16b,v21.16b,v21.16b,#12
        cbnz    x4,.Loop_upper_neon

        // The scalar block worked on by .Loop_upper_neon is complete.
        // Feed-forward: add the saved input state back into the sixteen
        // working words (w5-w17,w19-w21).  The input state is kept packed,
        // two 32-bit words per register, in x22-x28 and x30.  Low halves are
        // added with 32-bit adds; high halves use a 64-bit add with lsr#32,
        // and any carry above bit 31 is discarded by the lsl#32 in the pack
        // step below.
        add     w5,w5,w22               // accumulate key block
        add     x6,x6,x22,lsr#32
        add     w7,w7,w23
        add     x8,x8,x23,lsr#32
        add     w9,w9,w24
        add     x10,x10,x24,lsr#32
        add     w11,w11,w25
        add     x12,x12,x25,lsr#32
        add     w13,w13,w26
        add     x14,x14,x26,lsr#32
        add     w15,w15,w27
        add     x16,x16,x27,lsr#32
        add     w17,w17,w28
        add     x19,x19,x28,lsr#32
        add     w20,w20,w30
        add     x21,x21,x30,lsr#32

        // Pack word pairs into eight 64-bit keystream registers
        // (x5,x7,x9,x11,x13,x15,x17,x20).  Each odd register is reused to
        // load 64 bytes of input from x1 as soon as it has been consumed.
        add     x5,x5,x6,lsl#32 // pack
        add     x7,x7,x8,lsl#32
        ldp     x6,x8,[x1,#0]           // load input
        add     x9,x9,x10,lsl#32
        add     x11,x11,x12,lsl#32
        ldp     x10,x12,[x1,#16]
        add     x13,x13,x14,lsl#32
        add     x15,x15,x16,lsl#32
        ldp     x14,x16,[x1,#32]
        add     x17,x17,x19,lsl#32
        add     x20,x20,x21,lsl#32
        ldp     x19,x21,[x1,#48]
        add     x1,x1,#64               // input pointer moves past this block
#ifdef  __ARMEB__
        // Big-endian builds: byte-reverse every packed keystream register
        // before it is XORed with input that was loaded in memory order.
        rev     x5,x5
        rev     x7,x7
        rev     x9,x9
        rev     x11,x11
        rev     x13,x13
        rev     x15,x15
        rev     x17,x17
        rev     x20,x20
#endif
        // output = keystream XOR input
        eor     x5,x5,x6
        eor     x7,x7,x8
        eor     x9,x9,x10
        eor     x11,x11,x12
        eor     x13,x13,x14
        eor     x15,x15,x16
        eor     x17,x17,x19
        eor     x20,x20,x21

        // Store the 64 output bytes to x0 and, interleaved with the stores,
        // unpack the input state into the working registers again for the
        // next scalar block.  x28 is bumped before w17 is reloaded from it,
        // so the next block sees the incremented counter.
        stp     x5,x7,[x0,#0]           // store output
        add     x28,x28,#1                      // increment counter
        mov     w5,w22                  // unpack key block
        lsr     x6,x22,#32
        stp     x9,x11,[x0,#16]
        mov     w7,w23
        lsr     x8,x23,#32
        stp     x13,x15,[x0,#32]
        mov     w9,w24
        lsr     x10,x24,#32
        stp     x17,x20,[x0,#48]
        add     x0,x0,#64               // output pointer moves past this block
        mov     w11,w25
        lsr     x12,x25,#32
        mov     w13,w26
        lsr     x14,x26,#32
        mov     w15,w27
        lsr     x16,x27,#32
        mov     w17,w28
        lsr     x19,x28,#32
        mov     w20,w30
        lsr     x21,x30,#32

        mov     x4,#5                   // pass count for .Loop_lower_neon
// .Loop_lower_neon
//
// Runs while x4 counts down to zero (x4 is set to 5 just before the label).
// Two independent instruction streams are interleaved line by line:
//
//   NEON   - six register groups v0-v3, v4-v7, v8-v11, v12-v15, v16-v19 and
//            v20-v23 all receive the same add/eor/rotate sequence.  Within a
//            group the registers play the a,b,c,d roles of the ChaCha
//            quarter-round.  v24-v29 are scratch for the rotates.
//            One pass performs 2 NEON rounds.
//   scalar - one state in w5-w17,w19-w21.  One pass performs 4 scalar rounds.
//
// Rotate encodings used below:
//   NEON   rev32 on .8h lanes       = rotate 32-bit lane by 16
//          ushr #20 then sli #12    = rotate left by 12
//          ushr #24 then sli #8     = rotate left by 8
//          ushr #25 then sli #7     = rotate left by 7
//   scalar ror #16, #20, #24, #25   = rotate left by 16, 12, 8, 7
//
// No instruction in the loop writes the condition flags; the exit test is
// the cbnz on x4 at the bottom.
.Loop_lower_neon:
        sub     x4,x4,#1                // one pass fewer remaining
        // ---- NEON round 1 of 2, scalar rounds 1-2 of 4 ----
        add     v0.4s,v0.4s,v1.4s
        add     w5,w5,w9                // scalar: a[i] pairs with b[i]
        add     v4.4s,v4.4s,v5.4s
        add     w6,w6,w10
        add     v8.4s,v8.4s,v9.4s
        add     w7,w7,w11
        add     v12.4s,v12.4s,v13.4s
        add     w8,w8,w12
        add     v16.4s,v16.4s,v17.4s
        eor     w17,w17,w5
        add     v20.4s,v20.4s,v21.4s
        eor     w19,w19,w6
        eor     v3.16b,v3.16b,v0.16b
        eor     w20,w20,w7
        eor     v7.16b,v7.16b,v4.16b
        eor     w21,w21,w8
        eor     v11.16b,v11.16b,v8.16b
        ror     w17,w17,#16
        eor     v15.16b,v15.16b,v12.16b
        ror     w19,w19,#16
        eor     v19.16b,v19.16b,v16.16b
        ror     w20,w20,#16
        eor     v23.16b,v23.16b,v20.16b
        ror     w21,w21,#16
        rev32   v3.8h,v3.8h             // NEON: rotate by 16
        add     w13,w13,w17
        rev32   v7.8h,v7.8h
        add     w14,w14,w19
        rev32   v11.8h,v11.8h
        add     w15,w15,w20
        rev32   v15.8h,v15.8h
        add     w16,w16,w21
        rev32   v19.8h,v19.8h
        eor     w9,w9,w13
        rev32   v23.8h,v23.8h
        eor     w10,w10,w14
        add     v2.4s,v2.4s,v3.4s
        eor     w11,w11,w15
        add     v6.4s,v6.4s,v7.4s
        eor     w12,w12,w16
        add     v10.4s,v10.4s,v11.4s
        ror     w9,w9,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w10,w10,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w11,w11,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w12,w12,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w9
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w10
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w11
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w12
        eor     v28.16b,v17.16b,v18.16b
        eor     w17,w17,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w19,w19,w6
        ushr    v1.4s,v24.4s,#20        // NEON: rotate left by 12 (with sli below)
        eor     w20,w20,w7
        ushr    v5.4s,v25.4s,#20
        eor     w21,w21,w8
        ushr    v9.4s,v26.4s,#20
        ror     w17,w17,#24
        ushr    v13.4s,v27.4s,#20
        ror     w19,w19,#24
        ushr    v17.4s,v28.4s,#20
        ror     w20,w20,#24
        ushr    v21.4s,v29.4s,#20
        ror     w21,w21,#24
        sli     v1.4s,v24.4s,#12
        add     w13,w13,w17
        sli     v5.4s,v25.4s,#12
        add     w14,w14,w19
        sli     v9.4s,v26.4s,#12
        add     w15,w15,w20
        sli     v13.4s,v27.4s,#12
        add     w16,w16,w21
        sli     v17.4s,v28.4s,#12
        eor     w9,w9,w13
        sli     v21.4s,v29.4s,#12
        eor     w10,w10,w14
        add     v0.4s,v0.4s,v1.4s
        eor     w11,w11,w15
        add     v4.4s,v4.4s,v5.4s
        eor     w12,w12,w16
        add     v8.4s,v8.4s,v9.4s
        ror     w9,w9,#25
        add     v12.4s,v12.4s,v13.4s
        ror     w10,w10,#25
        add     v16.4s,v16.4s,v17.4s
        ror     w11,w11,#25
        add     v20.4s,v20.4s,v21.4s
        ror     w12,w12,#25
        eor     v24.16b,v3.16b,v0.16b
        add     w5,w5,w10               // scalar round 2: a[i] pairs with b[i+1]
        eor     v25.16b,v7.16b,v4.16b
        add     w6,w6,w11
        eor     v26.16b,v11.16b,v8.16b
        add     w7,w7,w12
        eor     v27.16b,v15.16b,v12.16b
        add     w8,w8,w9
        eor     v28.16b,v19.16b,v16.16b
        eor     w21,w21,w5
        eor     v29.16b,v23.16b,v20.16b
        eor     w17,w17,w6
        ushr    v3.4s,v24.4s,#24        // NEON: rotate left by 8 (with sli below)
        eor     w19,w19,w7
        ushr    v7.4s,v25.4s,#24
        eor     w20,w20,w8
        ushr    v11.4s,v26.4s,#24
        ror     w21,w21,#16
        ushr    v15.4s,v27.4s,#24
        ror     w17,w17,#16
        ushr    v19.4s,v28.4s,#24
        ror     w19,w19,#16
        ushr    v23.4s,v29.4s,#24
        ror     w20,w20,#16
        sli     v3.4s,v24.4s,#8
        add     w15,w15,w21
        sli     v7.4s,v25.4s,#8
        add     w16,w16,w17
        sli     v11.4s,v26.4s,#8
        add     w13,w13,w19
        sli     v15.4s,v27.4s,#8
        add     w14,w14,w20
        sli     v19.4s,v28.4s,#8
        eor     w10,w10,w15
        sli     v23.4s,v29.4s,#8
        eor     w11,w11,w16
        add     v2.4s,v2.4s,v3.4s
        eor     w12,w12,w13
        add     v6.4s,v6.4s,v7.4s
        eor     w9,w9,w14
        add     v10.4s,v10.4s,v11.4s
        ror     w10,w10,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w11,w11,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w12,w12,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w9,w9,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w10
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w11
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w12
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w9
        eor     v28.16b,v17.16b,v18.16b
        eor     w21,w21,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w17,w17,w6
        ushr    v1.4s,v24.4s,#25        // NEON: rotate left by 7 (with sli below)
        eor     w19,w19,w7
        ushr    v5.4s,v25.4s,#25
        eor     w20,w20,w8
        ushr    v9.4s,v26.4s,#25
        ror     w21,w21,#24
        ushr    v13.4s,v27.4s,#25
        ror     w17,w17,#24
        ushr    v17.4s,v28.4s,#25
        ror     w19,w19,#24
        ushr    v21.4s,v29.4s,#25
        ror     w20,w20,#24
        sli     v1.4s,v24.4s,#7
        add     w15,w15,w21
        sli     v5.4s,v25.4s,#7
        add     w16,w16,w17
        sli     v9.4s,v26.4s,#7
        add     w13,w13,w19
        sli     v13.4s,v27.4s,#7
        add     w14,w14,w20
        sli     v17.4s,v28.4s,#7
        eor     w10,w10,w15
        sli     v21.4s,v29.4s,#7
        eor     w11,w11,w16
        // NEON: rotate the 32-bit lanes inside each row so that the next
        // round mixes different lane combinations: c rows by 2 lanes (#8),
        // d rows by 3 lanes (#12), b rows by 1 lane (#4).
        ext     v2.16b,v2.16b,v2.16b,#8
        eor     w12,w12,w13
        ext     v6.16b,v6.16b,v6.16b,#8
        eor     w9,w9,w14
        ext     v10.16b,v10.16b,v10.16b,#8
        ror     w10,w10,#25
        ext     v14.16b,v14.16b,v14.16b,#8
        ror     w11,w11,#25
        ext     v18.16b,v18.16b,v18.16b,#8
        ror     w12,w12,#25
        ext     v22.16b,v22.16b,v22.16b,#8
        ror     w9,w9,#25
        ext     v3.16b,v3.16b,v3.16b,#12
        ext     v7.16b,v7.16b,v7.16b,#12
        ext     v11.16b,v11.16b,v11.16b,#12
        ext     v15.16b,v15.16b,v15.16b,#12
        ext     v19.16b,v19.16b,v19.16b,#12
        ext     v23.16b,v23.16b,v23.16b,#12
        ext     v1.16b,v1.16b,v1.16b,#4
        ext     v5.16b,v5.16b,v5.16b,#4
        ext     v9.16b,v9.16b,v9.16b,#4
        ext     v13.16b,v13.16b,v13.16b,#4
        ext     v17.16b,v17.16b,v17.16b,#4
        ext     v21.16b,v21.16b,v21.16b,#4
        // ---- NEON round 2 of 2, scalar rounds 3-4 of 4 ----
        add     v0.4s,v0.4s,v1.4s
        add     w5,w5,w9                // scalar round 3: a[i] pairs with b[i] again
        add     v4.4s,v4.4s,v5.4s
        add     w6,w6,w10
        add     v8.4s,v8.4s,v9.4s
        add     w7,w7,w11
        add     v12.4s,v12.4s,v13.4s
        add     w8,w8,w12
        add     v16.4s,v16.4s,v17.4s
        eor     w17,w17,w5
        add     v20.4s,v20.4s,v21.4s
        eor     w19,w19,w6
        eor     v3.16b,v3.16b,v0.16b
        eor     w20,w20,w7
        eor     v7.16b,v7.16b,v4.16b
        eor     w21,w21,w8
        eor     v11.16b,v11.16b,v8.16b
        ror     w17,w17,#16
        eor     v15.16b,v15.16b,v12.16b
        ror     w19,w19,#16
        eor     v19.16b,v19.16b,v16.16b
        ror     w20,w20,#16
        eor     v23.16b,v23.16b,v20.16b
        ror     w21,w21,#16
        rev32   v3.8h,v3.8h
        add     w13,w13,w17
        rev32   v7.8h,v7.8h
        add     w14,w14,w19
        rev32   v11.8h,v11.8h
        add     w15,w15,w20
        rev32   v15.8h,v15.8h
        add     w16,w16,w21
        rev32   v19.8h,v19.8h
        eor     w9,w9,w13
        rev32   v23.8h,v23.8h
        eor     w10,w10,w14
        add     v2.4s,v2.4s,v3.4s
        eor     w11,w11,w15
        add     v6.4s,v6.4s,v7.4s
        eor     w12,w12,w16
        add     v10.4s,v10.4s,v11.4s
        ror     w9,w9,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w10,w10,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w11,w11,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w12,w12,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w9
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w10
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w11
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w12
        eor     v28.16b,v17.16b,v18.16b
        eor     w17,w17,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w19,w19,w6
        ushr    v1.4s,v24.4s,#20
        eor     w20,w20,w7
        ushr    v5.4s,v25.4s,#20
        eor     w21,w21,w8
        ushr    v9.4s,v26.4s,#20
        ror     w17,w17,#24
        ushr    v13.4s,v27.4s,#20
        ror     w19,w19,#24
        ushr    v17.4s,v28.4s,#20
        ror     w20,w20,#24
        ushr    v21.4s,v29.4s,#20
        ror     w21,w21,#24
        sli     v1.4s,v24.4s,#12
        add     w13,w13,w17
        sli     v5.4s,v25.4s,#12
        add     w14,w14,w19
        sli     v9.4s,v26.4s,#12
        add     w15,w15,w20
        sli     v13.4s,v27.4s,#12
        add     w16,w16,w21
        sli     v17.4s,v28.4s,#12
        eor     w9,w9,w13
        sli     v21.4s,v29.4s,#12
        eor     w10,w10,w14
        add     v0.4s,v0.4s,v1.4s
        eor     w11,w11,w15
        add     v4.4s,v4.4s,v5.4s
        eor     w12,w12,w16
        add     v8.4s,v8.4s,v9.4s
        ror     w9,w9,#25
        add     v12.4s,v12.4s,v13.4s
        ror     w10,w10,#25
        add     v16.4s,v16.4s,v17.4s
        ror     w11,w11,#25
        add     v20.4s,v20.4s,v21.4s
        ror     w12,w12,#25
        eor     v24.16b,v3.16b,v0.16b
        add     w5,w5,w10               // scalar round 4: a[i] pairs with b[i+1]
        eor     v25.16b,v7.16b,v4.16b
        add     w6,w6,w11
        eor     v26.16b,v11.16b,v8.16b
        add     w7,w7,w12
        eor     v27.16b,v15.16b,v12.16b
        add     w8,w8,w9
        eor     v28.16b,v19.16b,v16.16b
        eor     w21,w21,w5
        eor     v29.16b,v23.16b,v20.16b
        eor     w17,w17,w6
        ushr    v3.4s,v24.4s,#24
        eor     w19,w19,w7
        ushr    v7.4s,v25.4s,#24
        eor     w20,w20,w8
        ushr    v11.4s,v26.4s,#24
        ror     w21,w21,#16
        ushr    v15.4s,v27.4s,#24
        ror     w17,w17,#16
        ushr    v19.4s,v28.4s,#24
        ror     w19,w19,#16
        ushr    v23.4s,v29.4s,#24
        ror     w20,w20,#16
        sli     v3.4s,v24.4s,#8
        add     w15,w15,w21
        sli     v7.4s,v25.4s,#8
        add     w16,w16,w17
        sli     v11.4s,v26.4s,#8
        add     w13,w13,w19
        sli     v15.4s,v27.4s,#8
        add     w14,w14,w20
        sli     v19.4s,v28.4s,#8
        eor     w10,w10,w15
        sli     v23.4s,v29.4s,#8
        eor     w11,w11,w16
        add     v2.4s,v2.4s,v3.4s
        eor     w12,w12,w13
        add     v6.4s,v6.4s,v7.4s
        eor     w9,w9,w14
        add     v10.4s,v10.4s,v11.4s
        ror     w10,w10,#20
        add     v14.4s,v14.4s,v15.4s
        ror     w11,w11,#20
        add     v18.4s,v18.4s,v19.4s
        ror     w12,w12,#20
        add     v22.4s,v22.4s,v23.4s
        ror     w9,w9,#20
        eor     v24.16b,v1.16b,v2.16b
        add     w5,w5,w10
        eor     v25.16b,v5.16b,v6.16b
        add     w6,w6,w11
        eor     v26.16b,v9.16b,v10.16b
        add     w7,w7,w12
        eor     v27.16b,v13.16b,v14.16b
        add     w8,w8,w9
        eor     v28.16b,v17.16b,v18.16b
        eor     w21,w21,w5
        eor     v29.16b,v21.16b,v22.16b
        eor     w17,w17,w6
        ushr    v1.4s,v24.4s,#25
        eor     w19,w19,w7
        ushr    v5.4s,v25.4s,#25
        eor     w20,w20,w8
        ushr    v9.4s,v26.4s,#25
        ror     w21,w21,#24
        ushr    v13.4s,v27.4s,#25
        ror     w17,w17,#24
        ushr    v17.4s,v28.4s,#25
        ror     w19,w19,#24
        ushr    v21.4s,v29.4s,#25
        ror     w20,w20,#24
        sli     v1.4s,v24.4s,#7
        add     w15,w15,w21
        sli     v5.4s,v25.4s,#7
        add     w16,w16,w17
        sli     v9.4s,v26.4s,#7
        add     w13,w13,w19
        sli     v13.4s,v27.4s,#7
        add     w14,w14,w20
        sli     v17.4s,v28.4s,#7
        eor     w10,w10,w15
        sli     v21.4s,v29.4s,#7
        eor     w11,w11,w16
        // NEON: undo the lane rotation applied after round 1: c rows by 2
        // lanes (#8), d rows by 1 lane (#4), b rows by 3 lanes (#12).
        ext     v2.16b,v2.16b,v2.16b,#8
        eor     w12,w12,w13
        ext     v6.16b,v6.16b,v6.16b,#8
        eor     w9,w9,w14
        ext     v10.16b,v10.16b,v10.16b,#8
        ror     w10,w10,#25
        ext     v14.16b,v14.16b,v14.16b,#8
        ror     w11,w11,#25
        ext     v18.16b,v18.16b,v18.16b,#8
        ror     w12,w12,#25
        ext     v22.16b,v22.16b,v22.16b,#8
        ror     w9,w9,#25
        ext     v3.16b,v3.16b,v3.16b,#4
        ext     v7.16b,v7.16b,v7.16b,#4
        ext     v11.16b,v11.16b,v11.16b,#4
        ext     v15.16b,v15.16b,v15.16b,#4
        ext     v19.16b,v19.16b,v19.16b,#4
        ext     v23.16b,v23.16b,v23.16b,#4
        ext     v1.16b,v1.16b,v1.16b,#12
        ext     v5.16b,v5.16b,v5.16b,#12
        ext     v9.16b,v9.16b,v9.16b,#12
        ext     v13.16b,v13.16b,v13.16b,#12
        ext     v17.16b,v17.16b,v17.16b,#12
        ext     v21.16b,v21.16b,v21.16b,#12
        cbnz    x4,.Loop_lower_neon     // repeat until the pass counter hits zero

        // All rounds are done for the second scalar block and the six NEON
        // blocks.  Feed-forward: add the input state back into every block.
        //   scalar: w5-w17,w19-w21 += words packed in x22-x28,x30
        //   NEON:   v24-v29 are reloaded from the off-load area at
        //           sp+0..sp+95 (they served as scratch during the rounds).
        //           v24 is added to every a row, v25 to every b row, v26 to
        //           every c row; the d rows get v27,v28,v29,v30 for the first
        //           four groups and v27+v31, v28+v31 for the last two.
        //   NOTE(review): the off-load area is written before this chunk;
        //   the layout is assumed to be v24..v29 in order - confirm against
        //   the code that stores it.
        add     w5,w5,w22               // accumulate key block
        ldp     q24,q25,[sp,#0]
        add     x6,x6,x22,lsr#32
        ldp     q26,q27,[sp,#32]
        add     w7,w7,w23
        ldp     q28,q29,[sp,#64]
        add     x8,x8,x23,lsr#32
        add     v0.4s,v0.4s,v24.4s
        add     w9,w9,w24
        add     v4.4s,v4.4s,v24.4s
        add     x10,x10,x24,lsr#32
        add     v8.4s,v8.4s,v24.4s
        add     w11,w11,w25
        add     v12.4s,v12.4s,v24.4s
        add     x12,x12,x25,lsr#32
        add     v16.4s,v16.4s,v24.4s
        add     w13,w13,w26
        add     v20.4s,v20.4s,v24.4s
        add     x14,x14,x26,lsr#32
        add     v2.4s,v2.4s,v26.4s
        add     w15,w15,w27
        add     v6.4s,v6.4s,v26.4s
        add     x16,x16,x27,lsr#32
        add     v10.4s,v10.4s,v26.4s
        add     w17,w17,w28
        add     v14.4s,v14.4s,v26.4s
        add     x19,x19,x28,lsr#32
        add     v18.4s,v18.4s,v26.4s
        add     w20,w20,w30
        add     v22.4s,v22.4s,v26.4s
        add     x21,x21,x30,lsr#32
        add     v19.4s,v19.4s,v31.4s                    // +4
        add     x5,x5,x6,lsl#32 // pack
        add     v23.4s,v23.4s,v31.4s                    // +4
        add     x7,x7,x8,lsl#32
        add     v3.4s,v3.4s,v27.4s
        ldp     x6,x8,[x1,#0]           // load input
        add     v7.4s,v7.4s,v28.4s
        add     x9,x9,x10,lsl#32
        add     v11.4s,v11.4s,v29.4s
        add     x11,x11,x12,lsl#32
        add     v15.4s,v15.4s,v30.4s
        ldp     x10,x12,[x1,#16]
        add     v19.4s,v19.4s,v27.4s
        add     x13,x13,x14,lsl#32
        add     v23.4s,v23.4s,v28.4s
        add     x15,x15,x16,lsl#32
        add     v1.4s,v1.4s,v25.4s
        ldp     x14,x16,[x1,#32]
        add     v5.4s,v5.4s,v25.4s
        add     x17,x17,x19,lsl#32
        add     v9.4s,v9.4s,v25.4s
        add     x20,x20,x21,lsl#32
        add     v13.4s,v13.4s,v25.4s
        ldp     x19,x21,[x1,#48]
        add     v17.4s,v17.4s,v25.4s
        add     x1,x1,#64               // input pointer past the scalar block
        add     v21.4s,v21.4s,v25.4s

#ifdef  __ARMEB__
        // Big-endian builds: byte-reverse the packed scalar keystream words
        // before XORing with input loaded in memory order.
        rev     x5,x5
        rev     x7,x7
        rev     x9,x9
        rev     x11,x11
        rev     x13,x13
        rev     x15,x15
        rev     x17,x17
        rev     x20,x20
#endif
        // From here v24-v27 and, as they are stored, v0-v15 are reused as
        // 64-byte input buffers.  Each ld1 post-increments x1 by 64 and each
        // st1 post-increments x0 by 64, so loads run one block ahead of the
        // XOR/store of the previous block.
        ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
        eor     x5,x5,x6                // scalar block: keystream XOR input
        eor     x7,x7,x8
        eor     x9,x9,x10
        eor     x11,x11,x12
        eor     x13,x13,x14
        eor     v0.16b,v0.16b,v24.16b   // NEON group v0-v3 XOR input
        eor     x15,x15,x16
        eor     v1.16b,v1.16b,v25.16b
        eor     x17,x17,x19
        eor     v2.16b,v2.16b,v26.16b
        eor     x20,x20,x21
        eor     v3.16b,v3.16b,v27.16b
        ld1     {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64

        stp     x5,x7,[x0,#0]           // store output
        add     x28,x28,#7                      // increment counter
        stp     x9,x11,[x0,#16]
        stp     x13,x15,[x0,#32]
        stp     x17,x20,[x0,#48]
        add     x0,x0,#64
        st1     {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64

        ld1     {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
        eor     v4.16b,v4.16b,v24.16b
        eor     v5.16b,v5.16b,v25.16b
        eor     v6.16b,v6.16b,v26.16b
        eor     v7.16b,v7.16b,v27.16b
        st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

        ld1     {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
        eor     v8.16b,v8.16b,v0.16b
        ldp     q24,q25,[sp,#0]         // v24-v27 no longer hold input: restore
        eor     v9.16b,v9.16b,v1.16b
        ldp     q26,q27,[sp,#32]        // them from the off-load area
        eor     v10.16b,v10.16b,v2.16b
        eor     v11.16b,v11.16b,v3.16b
        st1     {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64

        ld1     {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
        eor     v12.16b,v12.16b,v4.16b
        eor     v13.16b,v13.16b,v5.16b
        eor     v14.16b,v14.16b,v6.16b
        eor     v15.16b,v15.16b,v7.16b
        st1     {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64

        ld1     {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
        eor     v16.16b,v16.16b,v8.16b
        eor     v17.16b,v17.16b,v9.16b
        eor     v18.16b,v18.16b,v10.16b
        eor     v19.16b,v19.16b,v11.16b
        st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

        shl     v0.4s,v31.4s,#1                 // 4 -> 8
        eor     v20.16b,v20.16b,v12.16b
        eor     v21.16b,v21.16b,v13.16b
        eor     v22.16b,v22.16b,v14.16b
        eor     v23.16b,v23.16b,v15.16b
        st1     {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64

        // Advance the four d-row templates by 8 for the next outer pass.
        add     v27.4s,v27.4s,v0.4s                     // += 8
        add     v28.4s,v28.4s,v0.4s
        add     v29.4s,v29.4s,v0.4s
        add     v30.4s,v30.4s,v0.4s

        // NOTE(review): nothing in the visible part of the loop body writes
        // the condition flags, so this branch tests flags produced before
        // this chunk (presumably a subs on x2 at the top of the outer loop,
        // given the adds #512 below) - confirm.
        b.hs    .Loop_outer_512_neon

        // Fell out of the outer loop.  Add 512 back to x2; the Z flag from
        // this adds is consumed by the b.eq below (the ushr/ldp/stp lines in
        // between do not touch the flags).
        adds    x2,x2,#512
        ushr    v0.4s,v31.4s,#2                 // 4 -> 1

        // Restore the callee-saved low halves of v8-v15 from sp+128.
        ldp     d8,d9,[sp,#128+0]               // meet ABI requirements
        ldp     d10,d11,[sp,#128+16]
        ldp     d12,d13,[sp,#128+32]
        ldp     d14,d15,[sp,#128+48]

        // Overwrite all 96 bytes of the off-load area with v24/v31 so the
        // rows previously saved there (presumably key and counter material)
        // do not remain on the stack.
        stp     q24,q31,[sp,#0]         // wipe off-load area
        stp     q24,q31,[sp,#32]
        stp     q24,q31,[sp,#64]

        b.eq    .Ldone_512_neon         // x2 == 0: nothing left to process

        // Tail: hand the remaining bytes to a smaller code path.  cmp sets
        // the flags for the b.hs below; the vector subs and the sp
        // adjustment in between leave them alone.
        cmp     x2,#192
        sub     v27.4s,v27.4s,v0.4s                     // -= 1
        sub     v28.4s,v28.4s,v0.4s
        sub     v29.4s,v29.4s,v0.4s
        add     sp,sp,#128              // release the lower 128 bytes of this frame
        b.hs    .Loop_outer_neon        // x2 >= 192

        // Fewer than 192 bytes left: clear v25-v30 and continue in the
        // scalar loop.
        eor     v25.16b,v25.16b,v25.16b
        eor     v26.16b,v26.16b,v26.16b
        eor     v27.16b,v27.16b,v27.16b
        eor     v28.16b,v28.16b,v28.16b
        eor     v29.16b,v29.16b,v29.16b
        eor     v30.16b,v30.16b,v30.16b
        b       .Loop_outer

// .Ldone_512_neon: common exit once all input has been consumed.
// Reloads callee-saved x19-x28 from the frame addressed through x29,
// releases the 128+64 bytes of local stack, then pops x29/x30 together with
// the 96-byte register save area and returns.  x30 is used as a data
// register in the body (it carries packed state words), so the return
// address comes from the frame here.
// NOTE(review): the matching prologue is outside this chunk; the offsets
// below are assumed to mirror it - confirm.
.Ldone_512_neon:
        ldp     x19,x20,[x29,#16]
        add     sp,sp,#128+64           // drop local scratch; sp is now at the saved x29/x30 pair
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#96        // restore FP/LR and free the 96-byte save area
        ret
.size   ChaCha20_512_neon,.-ChaCha20_512_neon
#endif