; nexmon – Rev 1

; Subversion Repositories:
; Rev:
default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section .text code align=64


EXTERN  OPENSSL_ia32cap_P

ALIGN   64
; ----------------------------------------------------------------------
; Read-only constant pool shared by the ChaCha20 code paths in this file.
; ----------------------------------------------------------------------

; Four zero dwords.
$L$zero:
        TIMES 4 DD 0

; Added to state row 3 in the one-block paths: bumps only the first
; dword (the block counter) by one.
$L$one:
        DD      1
        TIMES 3 DD 0

; Per-lane counter offsets for the four-block path (lane i = block i).
$L$inc:
        DD      0,1,2,3

; Counter step between outer iterations of the four-block path.
$L$four:
        TIMES 4 DD 4

; Per-lane counter offsets for the eight-block (ymm) path.
$L$incy:
        DD      0,2,4,6
        DD      1,3,5,7

; Counter step between outer iterations of the eight-block path.
$L$eight:
        TIMES 8 DD 8

; pshufb mask: rotates every dword left by 16 bits.
$L$rot16:
        DB      0x02,0x03,0x00,0x01
        DB      0x06,0x07,0x04,0x05
        DB      0x0a,0x0b,0x08,0x09
        DB      0x0e,0x0f,0x0c,0x0d

; pshufb mask: rotates every dword left by 8 bits (i.e. right by 24).
$L$rot24:
        DB      0x03,0x00,0x01,0x02
        DB      0x07,0x04,0x05,0x06
        DB      0x0b,0x08,0x09,0x0a
        DB      0x0f,0x0c,0x0d,0x0e

; State row 0: the 16 ASCII bytes "expand 32-byte k", NUL-terminated.
$L$sigma:
        DB      'expand 32-byte k',0

; Identification string embedded in the object file (NUL-terminated).
        DB      'ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>',0
;-----------------------------------------------------------------------
; ChaCha20_ctr32 - exported entry point and integer-register code path.
;
; Win64 entry: arguments arrive in rcx, rdx, r8, r9 and [rsp+40].  The
; prologue parks the caller's rdi/rsi in the slots at [rsp+8]/[rsp+16]
; and moves the arguments into rdi, rsi, rdx, rcx, r8.  After the remap:
;   rdi = output pointer (input XOR keystream is stored here)
;   rsi = input pointer
;   rdx = length in bytes (0 returns immediately)
;   rcx = pointer to 32 bytes loaded as state words 4..11
;         (presumably the key - confirm against the C prototype)
;   r8  = pointer to 16 bytes loaded as state words 12..15; word 12 is
;         the block counter and is incremented by 1 per 64-byte block
;
; If bit 9 (mask 512) of the dword at OPENSSL_ia32cap_P+4 is set,
; control transfers to $L$ChaCha20_ssse3 with r10 still holding the
; qword read from OPENSSL_ia32cap_P+4.
;
; Scalar-path stack frame (after "sub rsp,64+24"):
;   [rsp+ 0..15]  used only by the tail: keystream words 0..3
;   [rsp+16..31]  state words 4..7
;   [rsp+32..47]  state words 8..11; doubles as the spill area for the
;                 working copy of words 8..11 inside the round loop
;   [rsp+48..63]  state words 12..15 (word 12 = current counter)
;   [rsp+64]      saved remaining length
;   [rsp+72]      saved input pointer
;   [rsp+80]      saved output pointer
; On entry rsp is 8 mod 16; six pushes plus 88 bytes make it 16-byte
; aligned, which the movdqa accesses to the frame rely on.
;-----------------------------------------------------------------------
global  ChaCha20_ctr32

ALIGN   64
ChaCha20_ctr32:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_ctr32:
        ; Remap the Win64 argument registers (see header).
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


        ; Nothing to do for a zero length.
        cmp     rdx,0
        je      NEAR $L$no_data
        ; Capability dispatch: bit 9 selects the SSSE3 (pshufb) path.
        mov     r10,QWORD[((OPENSSL_ia32cap_P+4))]
        test    r10d,512
        jnz     NEAR $L$ChaCha20_ssse3

        ; Scalar path: every callee-saved GPR pushed here is used below.
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        sub     rsp,64+24


        ; xmm1 = words 4..7, xmm2 = words 8..11, xmm3 = words 12..15,
        ; xmm4 = {1,0,0,0} (counter increment).
        movdqu  xmm1,XMMWORD[rcx]
        movdqu  xmm2,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        movdqa  xmm4,XMMWORD[$L$one]


        ; Keep a copy of the input state in the frame; xmm2/xmm3 stay
        ; live across the whole function as the reference copies of
        ; words 8..11 and 12..15.
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        mov     rbp,rdx                 ; rbp = bytes remaining
        jmp     NEAR $L$oop_outer

ALIGN   32
; One iteration per 64-byte block.
$L$oop_outer:
        ; Working state: eax,ebx,ecx,edx = x0..x3 (the "expand 32-byte k"
        ; words), r8d..r11d = x4..x7, r12d..r15d = x12..x15.
        mov     eax,0x61707865
        mov     ebx,0x3320646e
        mov     ecx,0x79622d32
        mov     edx,0x6b206574
        mov     r8d,DWORD[16+rsp]
        mov     r9d,DWORD[20+rsp]
        mov     r10d,DWORD[24+rsp]
        mov     r11d,DWORD[28+rsp]
        movd    r12d,xmm3               ; x12 = current block counter
        mov     r13d,DWORD[52+rsp]
        mov     r14d,DWORD[56+rsp]
        mov     r15d,DWORD[60+rsp]

        ; rbp, rsi and rdi are needed as working registers inside the
        ; round loop, so their values are parked in the frame.
        mov     QWORD[((64+0))+rsp],rbp
        mov     ebp,10                  ; 10 double rounds = 20 rounds
        mov     QWORD[((64+8))+rsp],rsi
DB      102,72,15,126,214               ; movq rsi,xmm2: esi = x8, bits 63:32 = x9
        mov     QWORD[((64+16))+rsp],rdi
        mov     rdi,rsi
        shr     rdi,32                  ; edi = x9
        jmp     NEAR $L$oop

ALIGN   32
; One double round.  Only two of x8..x11 fit in registers (esi/edi) at a
; time; the other pair lives at [rsp+32..47] and is swapped in halfway
; through each round.
$L$oop:
        ; Column round, columns 0 and 1 interleaved:
        ;   (eax,r8d,esi=x8,r12d) and (ebx,r9d,edi=x9,r13d)
        add     eax,r8d
        xor     r12d,eax
        rol     r12d,16
        add     ebx,r9d
        xor     r13d,ebx
        rol     r13d,16
        add     esi,r12d
        xor     r8d,esi
        rol     r8d,12
        add     edi,r13d
        xor     r9d,edi
        rol     r9d,12
        add     eax,r8d
        xor     r12d,eax
        rol     r12d,8
        add     ebx,r9d
        xor     r13d,ebx
        rol     r13d,8
        add     esi,r12d
        xor     r8d,esi
        rol     r8d,7
        add     edi,r13d
        xor     r9d,edi
        rol     r9d,7
        ; Swap the c pair: spill x8/x9, load x10/x11.
        mov     DWORD[32+rsp],esi
        mov     DWORD[36+rsp],edi
        mov     esi,DWORD[40+rsp]
        mov     edi,DWORD[44+rsp]
        ; Column round, columns 2 and 3:
        ;   (ecx,r10d,esi=x10,r14d) and (edx,r11d,edi=x11,r15d)
        add     ecx,r10d
        xor     r14d,ecx
        rol     r14d,16
        add     edx,r11d
        xor     r15d,edx
        rol     r15d,16
        add     esi,r14d
        xor     r10d,esi
        rol     r10d,12
        add     edi,r15d
        xor     r11d,edi
        rol     r11d,12
        add     ecx,r10d
        xor     r14d,ecx
        rol     r14d,8
        add     edx,r11d
        xor     r15d,edx
        rol     r15d,8
        add     esi,r14d
        xor     r10d,esi
        rol     r10d,7
        add     edi,r15d
        xor     r11d,edi
        rol     r11d,7
        ; Diagonal round, first two diagonals:
        ;   (eax,r9d,esi=x10,r15d) and (ebx,r10d,edi=x11,r12d)
        add     eax,r9d
        xor     r15d,eax
        rol     r15d,16
        add     ebx,r10d
        xor     r12d,ebx
        rol     r12d,16
        add     esi,r15d
        xor     r9d,esi
        rol     r9d,12
        add     edi,r12d
        xor     r10d,edi
        rol     r10d,12
        add     eax,r9d
        xor     r15d,eax
        rol     r15d,8
        add     ebx,r10d
        xor     r12d,ebx
        rol     r12d,8
        add     esi,r15d
        xor     r9d,esi
        rol     r9d,7
        add     edi,r12d
        xor     r10d,edi
        rol     r10d,7
        ; Swap the c pair back: spill x10/x11, load x8/x9.
        mov     DWORD[40+rsp],esi
        mov     DWORD[44+rsp],edi
        mov     esi,DWORD[32+rsp]
        mov     edi,DWORD[36+rsp]
        ; Diagonal round, remaining two diagonals:
        ;   (ecx,r11d,esi=x8,r13d) and (edx,r8d,edi=x9,r14d)
        add     ecx,r11d
        xor     r13d,ecx
        rol     r13d,16
        add     edx,r8d
        xor     r14d,edx
        rol     r14d,16
        add     esi,r13d
        xor     r11d,esi
        rol     r11d,12
        add     edi,r14d
        xor     r8d,edi
        rol     r8d,12
        add     ecx,r11d
        xor     r13d,ecx
        rol     r13d,8
        add     edx,r8d
        xor     r14d,edx
        rol     r14d,8
        add     esi,r13d
        xor     r11d,esi
        rol     r11d,7
        add     edi,r14d
        xor     r8d,edi
        rol     r8d,7
        dec     ebp
        jnz     NEAR $L$oop
        ; Flush x8/x9 so [rsp+32..47] holds the complete worked x8..x11,
        ; then restore length and pointers.
        mov     DWORD[36+rsp],edi
        mov     DWORD[32+rsp],esi
        mov     rbp,QWORD[64+rsp]
        movdqa  xmm1,xmm2               ; xmm1 = input words 8..11
        mov     rsi,QWORD[((64+8))+rsp]
        paddd   xmm3,xmm4               ; xmm3 = counter for the NEXT block
        mov     rdi,QWORD[((64+16))+rsp]

        ; Feed-forward: add the input state to the worked state.
        ; [rsp+48] still holds the counter used for THIS block.
        add     eax,0x61707865
        add     ebx,0x3320646e
        add     ecx,0x79622d32
        add     edx,0x6b206574
        add     r8d,DWORD[16+rsp]
        add     r9d,DWORD[20+rsp]
        add     r10d,DWORD[24+rsp]
        add     r11d,DWORD[28+rsp]
        add     r12d,DWORD[48+rsp]
        add     r13d,DWORD[52+rsp]
        add     r14d,DWORD[56+rsp]
        add     r15d,DWORD[60+rsp]
        paddd   xmm1,XMMWORD[32+rsp]    ; xmm1 = keystream words 8..11

        ; Fewer than 64 bytes left: finish byte by byte.
        cmp     rbp,64
        jb      NEAR $L$tail

        ; Full block: XOR the keystream with 64 input bytes.
        xor     eax,DWORD[rsi]
        xor     ebx,DWORD[4+rsi]
        xor     ecx,DWORD[8+rsi]
        xor     edx,DWORD[12+rsi]
        xor     r8d,DWORD[16+rsi]
        xor     r9d,DWORD[20+rsi]
        xor     r10d,DWORD[24+rsi]
        xor     r11d,DWORD[28+rsi]
        movdqu  xmm0,XMMWORD[32+rsi]
        xor     r12d,DWORD[48+rsi]
        xor     r13d,DWORD[52+rsi]
        xor     r14d,DWORD[56+rsi]
        xor     r15d,DWORD[60+rsi]
        lea     rsi,[64+rsi]
        pxor    xmm0,xmm1

        ; Re-arm the frame for the next block: restore input words 8..11
        ; (the round loop overwrote them) and store the new counter.
        movdqa  XMMWORD[32+rsp],xmm2
        movd    DWORD[48+rsp],xmm3

        ; All input was read above before any output is written.
        mov     DWORD[rdi],eax
        mov     DWORD[4+rdi],ebx
        mov     DWORD[8+rdi],ecx
        mov     DWORD[12+rdi],edx
        mov     DWORD[16+rdi],r8d
        mov     DWORD[20+rdi],r9d
        mov     DWORD[24+rdi],r10d
        mov     DWORD[28+rdi],r11d
        movdqu  XMMWORD[32+rdi],xmm0
        mov     DWORD[48+rdi],r12d
        mov     DWORD[52+rdi],r13d
        mov     DWORD[56+rdi],r14d
        mov     DWORD[60+rdi],r15d
        lea     rdi,[64+rdi]

        sub     rbp,64
        jnz     NEAR $L$oop_outer

        jmp     NEAR $L$done

ALIGN   16
; Partial final block (1..63 bytes): lay the 64 keystream bytes out at
; [rsp+0..63] and XOR them into the data one byte at a time.
$L$tail:
        mov     DWORD[rsp],eax
        mov     DWORD[4+rsp],ebx
        xor     rbx,rbx                 ; rbx = byte index (ebx was stored above)
        mov     DWORD[8+rsp],ecx
        mov     DWORD[12+rsp],edx
        mov     DWORD[16+rsp],r8d
        mov     DWORD[20+rsp],r9d
        mov     DWORD[24+rsp],r10d
        mov     DWORD[28+rsp],r11d
        movdqa  XMMWORD[32+rsp],xmm1
        mov     DWORD[48+rsp],r12d
        mov     DWORD[52+rsp],r13d
        mov     DWORD[56+rsp],r14d
        mov     DWORD[60+rsp],r15d

$L$oop_tail:
        movzx   eax,BYTE[rbx*1+rsi]     ; input byte
        movzx   edx,BYTE[rbx*1+rsp]     ; keystream byte
        lea     rbx,[1+rbx]             ; lea leaves the flags untouched
        xor     eax,edx
        mov     BYTE[((-1))+rbx*1+rdi],al
        dec     rbp
        jnz     NEAR $L$oop_tail

$L$done:
        ; Undo the frame and the six pushes (reverse order).
        add     rsp,64+24
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
$L$no_data:
        ; rsp is back at its entry value: reload the caller's rdi/rsi.
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_ctr32:

;-----------------------------------------------------------------------
; ChaCha20_ssse3 - one 64-byte block per iteration, state held as four
; xmm rows; the 16- and 8-bit rotations use pshufb with the $L$rot16 /
; $L$rot24 masks.
;
; Entry points:
;   ChaCha20_ssse3          Win64 prologue, same argument remap as
;                           ChaCha20_ctr32.
;   $L$ChaCha20_ssse3       jumped to from ChaCha20_ctr32 with the
;                           arguments already in rdi/rsi/rdx/rcx/r8 and
;                           r10 = qword read from OPENSSL_ia32cap_P+4.
;   $L$do_sse3_after_all    jumped to from the 4x dispatcher when it
;                           decides to fall back to this path.
; NOTE(review): the ChaCha20_ssse3 prologue does not load r10, yet the
; 4x dispatcher reached for rdx > 128 reads it; the label is not
; declared global in the visible source, so presumably only the
; $L$ChaCha20_ssse3 entry is used - confirm.
;
; Registers: rdi = out, rsi = in, rdx = bytes remaining,
;            rcx = pointer to state words 4..11, r8 = pointer to words 12..15,
;            xmm0..xmm3 = working rows 0..3, xmm4/xmm5 = temporaries,
;            xmm6 = rot16 mask, xmm7 = rot24 mask, ebp = round counter,
;            rbx = byte index in the tail loop.
;
; Stack frame (after "sub rsp,64+72", 16-byte aligned):
;   [rsp+ 0..63]    input state rows 0..3 (row 3 carries the counter);
;                   overwritten with keystream in the tail
;   [rsp+96..127]   caller's xmm6/xmm7 (callee-saved on Win64)
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_ssse3:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_ssse3:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_ssse3:
        ; More than 128 bytes: hand over to the multi-block dispatcher.
        cmp     rdx,128
        ja      NEAR $L$ChaCha20_4x

$L$do_sse3_after_all:
        ; Same six pushes as the scalar path; of these registers only
        ; rbx and rbp are written below.
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15

        sub     rsp,64+72
        movaps  XMMWORD[(64+32)+rsp],xmm6
        movaps  XMMWORD[(64+48)+rsp],xmm7
        ; Load the four state rows and the two shuffle masks.
        movdqa  xmm0,XMMWORD[$L$sigma]
        movdqu  xmm1,XMMWORD[rcx]
        movdqu  xmm2,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        movdqa  xmm6,XMMWORD[$L$rot16]
        movdqa  xmm7,XMMWORD[$L$rot24]

        ; Keep the input state in the frame for the feed-forward.
        movdqa  XMMWORD[rsp],xmm0
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        mov     ebp,10                  ; 10 double rounds = 20 rounds
        jmp     NEAR $L$oop_ssse3

ALIGN   32
; Start of every block after the first: reload rows 0..2, add 1 to the
; first dword of the saved row 3 (the block counter) and store it back.
$L$oop_outer_ssse3:
        movdqa  xmm3,XMMWORD[$L$one]
        movdqa  xmm0,XMMWORD[rsp]
        movdqa  xmm1,XMMWORD[16+rsp]
        movdqa  xmm2,XMMWORD[32+rsp]
        paddd   xmm3,XMMWORD[48+rsp]
        mov     ebp,10
        movdqa  XMMWORD[48+rsp],xmm3
        jmp     NEAR $L$oop_ssse3

ALIGN   32
; One double round on all four columns at once (a=xmm0, b=xmm1, c=xmm2,
; d=xmm3).
$L$oop_ssse3:
        ; --- column round ---
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6: d <<<= 16
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        ; b <<<= 12, built from a shift pair
        movdqa  xmm4,xmm1
        psrld   xmm1,20
        pslld   xmm4,12
        por     xmm1,xmm4
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7: d <<<= 8
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        ; b <<<= 7
        movdqa  xmm4,xmm1
        psrld   xmm1,25
        pslld   xmm4,7
        por     xmm1,xmm4
        ; Rotate rows 1..3 by one, two and three dword lanes so the
        ; diagonals line up as columns.
        pshufd  xmm2,xmm2,78
        pshufd  xmm1,xmm1,57
        pshufd  xmm3,xmm3,147
        nop
        ; --- diagonal round (same sequence on the realigned rows) ---
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6: d <<<= 16
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,20
        pslld   xmm4,12
        por     xmm1,xmm4
        paddd   xmm0,xmm1
        pxor    xmm3,xmm0
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7: d <<<= 8
        paddd   xmm2,xmm3
        pxor    xmm1,xmm2
        movdqa  xmm4,xmm1
        psrld   xmm1,25
        pslld   xmm4,7
        por     xmm1,xmm4
        ; Undo the lane rotation (inverse immediates for rows 1 and 3).
        pshufd  xmm2,xmm2,78
        pshufd  xmm1,xmm1,147
        pshufd  xmm3,xmm3,57
        dec     ebp
        jnz     NEAR $L$oop_ssse3
        ; Feed-forward: add the saved input state.
        paddd   xmm0,XMMWORD[rsp]
        paddd   xmm1,XMMWORD[16+rsp]
        paddd   xmm2,XMMWORD[32+rsp]
        paddd   xmm3,XMMWORD[48+rsp]

        ; Fewer than 64 bytes left: finish byte by byte.
        cmp     rdx,64
        jb      NEAR $L$tail_ssse3

        ; Full block: out = in XOR keystream, 16 bytes per row.
        movdqu  xmm4,XMMWORD[rsi]
        movdqu  xmm5,XMMWORD[16+rsi]
        pxor    xmm0,xmm4
        movdqu  xmm4,XMMWORD[32+rsi]
        pxor    xmm1,xmm5
        movdqu  xmm5,XMMWORD[48+rsi]
        lea     rsi,[64+rsi]
        pxor    xmm2,xmm4
        pxor    xmm3,xmm5

        movdqu  XMMWORD[rdi],xmm0
        movdqu  XMMWORD[16+rdi],xmm1
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm3
        lea     rdi,[64+rdi]

        sub     rdx,64
        jnz     NEAR $L$oop_outer_ssse3

        jmp     NEAR $L$done_ssse3

ALIGN   16
; Partial final block: overwrite the saved state with the keystream and
; XOR it into the data one byte at a time.
$L$tail_ssse3:
        movdqa  XMMWORD[rsp],xmm0
        movdqa  XMMWORD[16+rsp],xmm1
        movdqa  XMMWORD[32+rsp],xmm2
        movdqa  XMMWORD[48+rsp],xmm3
        xor     rbx,rbx                 ; rbx = byte index

$L$oop_tail_ssse3:
        movzx   eax,BYTE[rbx*1+rsi]     ; input byte
        movzx   ecx,BYTE[rbx*1+rsp]     ; keystream byte
        lea     rbx,[1+rbx]
        xor     eax,ecx
        mov     BYTE[((-1))+rbx*1+rdi],al
        dec     rdx
        jnz     NEAR $L$oop_tail_ssse3

$L$done_ssse3:
        ; Restore the callee-saved xmm registers, then the GPRs.
        movaps  xmm6,XMMWORD[((64+32))+rsp]
        movaps  xmm7,XMMWORD[((64+48))+rsp]
        add     rsp,64+72
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_ssse3:

;-----------------------------------------------------------------------
; ChaCha20_4x - four 64-byte blocks per outer iteration.  Each xmm
; register holds ONE state word for four consecutive blocks (one block
; per dword lane); the result is transposed back to per-block byte order
; before it is XORed with the input.
;
; Entry points:
;   ChaCha20_4x         Win64 prologue, same argument remap as
;                       ChaCha20_ctr32.
;   $L$ChaCha20_4x      jumped to from $L$ChaCha20_ssse3 when rdx > 128;
;                       expects r10 = qword read from OPENSSL_ia32cap_P+4.
;
; Registers: rdi = out, rsi = in, rdx = bytes remaining,
;            rcx = rsp+256 (frame base for the broadcast state),
;            r10/r11 = addresses of $L$rot16 / $L$rot24,
;            eax = round counter.
;   Inside $L$oop4x: xmm8..xmm11 = x0..x3, xmm12..xmm15 = x4..x7,
;   xmm0..xmm3 = x12..x15, xmm4/xmm5 = whichever pair of x8..x11 is
;   currently active (the other pair is spilled to the frame),
;   xmm6/xmm7 = shift temporaries that are reloaded with the rot24 /
;   rot16 masks just before each pshufb pair.
;
; Stack frame (after "sub rsp,0x148+160"; rsp is 16-byte aligned):
;   [rsp+  0.. 63]  scratch: spilled x8..x11 pairs during the rounds,
;                   then transposed row-0 output / tail keystream
;   [rsp+ 64..127]  x0..x3 broadcast ($L$sigma words)
;   [rsp+128..191]  x4..x7 broadcast
;   [rsp+192..255]  x8..x11 broadcast
;   [rsp+256..271]  x12: per-lane block counters
;   [rsp+272..319]  x13..x15 broadcast
;   [rsp+320..479]  caller's xmm6..xmm15 (callee-saved on Win64)
; No general-purpose callee-saved register is pushed on this path.
;-----------------------------------------------------------------------
ALIGN   32
ChaCha20_4x:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_4x:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_4x:
        ; r10 = capability qword.  Bit 5 of its upper dword selects the
        ; eight-block ymm path.
        mov     r11,r10
        shr     r10,32
        test    r10,32
        jnz     NEAR $L$ChaCha20_8x
        cmp     rdx,192
        ja      NEAR $L$proceed4x

        ; For lengths <= 192: if, of bits 22 and 26 of the lower dword
        ; (mask 0x4400000), exactly bit 22 is set, use the one-block
        ; SSSE3 path instead.  Presumably a CPU-model heuristic -
        ; confirm against the generator source.
        and     r11,71303168
        cmp     r11,4194304
        je      NEAR $L$do_sse3_after_all

$L$proceed4x:
        ; r11 = entry rsp - 120, i.e. new rsp + 368; the ten xmm save
        ; slots span [r11-48 .. r11+111] = [rsp+320 .. rsp+479].
        lea     r11,[((-120))+rsp]
        sub     rsp,0x148+160
        movaps  XMMWORD[(-48)+r11],xmm6
        movaps  XMMWORD[(-32)+r11],xmm7
        movaps  XMMWORD[(-16)+r11],xmm8
        movaps  XMMWORD[r11],xmm9
        movaps  XMMWORD[16+r11],xmm10
        movaps  XMMWORD[32+r11],xmm11
        movaps  XMMWORD[48+r11],xmm12
        movaps  XMMWORD[64+r11],xmm13
        movaps  XMMWORD[80+r11],xmm14
        movaps  XMMWORD[96+r11],xmm15
        ; Load the four state rows, then repurpose rcx/r10/r11.
        movdqa  xmm11,XMMWORD[$L$sigma]
        movdqu  xmm15,XMMWORD[rcx]
        movdqu  xmm7,XMMWORD[16+rcx]
        movdqu  xmm3,XMMWORD[r8]
        lea     rcx,[256+rsp]
        lea     r10,[$L$rot16]
        lea     r11,[$L$rot24]

        ; Broadcast every state word across all four lanes and save it.
        ; Row 0 (x0..x3):
        pshufd  xmm8,xmm11,0x00
        pshufd  xmm9,xmm11,0x55
        movdqa  XMMWORD[64+rsp],xmm8
        pshufd  xmm10,xmm11,0xaa
        movdqa  XMMWORD[80+rsp],xmm9
        pshufd  xmm11,xmm11,0xff
        movdqa  XMMWORD[96+rsp],xmm10
        movdqa  XMMWORD[112+rsp],xmm11

        ; Row 1 (x4..x7):
        pshufd  xmm12,xmm15,0x00
        pshufd  xmm13,xmm15,0x55
        movdqa  XMMWORD[(128-256)+rcx],xmm12
        pshufd  xmm14,xmm15,0xaa
        movdqa  XMMWORD[(144-256)+rcx],xmm13
        pshufd  xmm15,xmm15,0xff
        movdqa  XMMWORD[(160-256)+rcx],xmm14
        movdqa  XMMWORD[(176-256)+rcx],xmm15

        ; Row 2 (x8..x11):
        pshufd  xmm4,xmm7,0x00
        pshufd  xmm5,xmm7,0x55
        movdqa  XMMWORD[(192-256)+rcx],xmm4
        pshufd  xmm6,xmm7,0xaa
        movdqa  XMMWORD[(208-256)+rcx],xmm5
        pshufd  xmm7,xmm7,0xff
        movdqa  XMMWORD[(224-256)+rcx],xmm6
        movdqa  XMMWORD[(240-256)+rcx],xmm7

        ; Row 3 (x12..x15).  x12 gets the per-lane offsets 0,1,2,3 so
        ; each lane works on its own block counter; it is stored to the
        ; frame at $L$oop_enter4x.
        pshufd  xmm0,xmm3,0x00
        pshufd  xmm1,xmm3,0x55
        paddd   xmm0,XMMWORD[$L$inc]
        pshufd  xmm2,xmm3,0xaa
        movdqa  XMMWORD[(272-256)+rcx],xmm1
        pshufd  xmm3,xmm3,0xff
        movdqa  XMMWORD[(288-256)+rcx],xmm2
        movdqa  XMMWORD[(304-256)+rcx],xmm3

        jmp     NEAR $L$oop_enter4x

ALIGN   32
; Start of every outer iteration after the first: reload the broadcast
; state and advance all four lane counters by 4.
$L$oop_outer4x:
        movdqa  xmm8,XMMWORD[64+rsp]
        movdqa  xmm9,XMMWORD[80+rsp]
        movdqa  xmm10,XMMWORD[96+rsp]
        movdqa  xmm11,XMMWORD[112+rsp]
        movdqa  xmm12,XMMWORD[((128-256))+rcx]
        movdqa  xmm13,XMMWORD[((144-256))+rcx]
        movdqa  xmm14,XMMWORD[((160-256))+rcx]
        movdqa  xmm15,XMMWORD[((176-256))+rcx]
        movdqa  xmm4,XMMWORD[((192-256))+rcx]
        movdqa  xmm5,XMMWORD[((208-256))+rcx]
        movdqa  xmm6,XMMWORD[((224-256))+rcx]
        movdqa  xmm7,XMMWORD[((240-256))+rcx]
        movdqa  xmm0,XMMWORD[((256-256))+rcx]
        movdqa  xmm1,XMMWORD[((272-256))+rcx]
        movdqa  xmm2,XMMWORD[((288-256))+rcx]
        movdqa  xmm3,XMMWORD[((304-256))+rcx]
        paddd   xmm0,XMMWORD[$L$four]

$L$oop_enter4x:
        ; Spill x10/x11 (xmm6/xmm7 are needed as temporaries), preload
        ; the rot16 mask and save the counters used for this iteration.
        movdqa  XMMWORD[32+rsp],xmm6
        movdqa  XMMWORD[48+rsp],xmm7
        movdqa  xmm7,XMMWORD[r10]
        mov     eax,10                  ; 10 double rounds = 20 rounds
        movdqa  XMMWORD[(256-256)+rcx],xmm0
        jmp     NEAR $L$oop4x

ALIGN   32
; One double round; two quarter-rounds are interleaved at a time.
$L$oop4x:
        ; Column round, columns 0 and 1:
        ;   (xmm8,xmm12,xmm4=x8,xmm0) and (xmm9,xmm13,xmm5=x9,xmm1)
        paddd   xmm8,xmm12
        paddd   xmm9,xmm13
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
DB      102,15,56,0,199                 ; pshufb xmm0,xmm7 (rot16)
DB      102,15,56,0,207                 ; pshufb xmm1,xmm7 (rot16)
        paddd   xmm4,xmm0
        paddd   xmm5,xmm1
        pxor    xmm12,xmm4
        pxor    xmm13,xmm5
        ; b <<<= 12 via shift pairs; xmm6 is then reloaded with rot24
        movdqa  xmm6,xmm12
        pslld   xmm12,12
        psrld   xmm6,20
        movdqa  xmm7,xmm13
        pslld   xmm13,12
        por     xmm12,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]
        por     xmm13,xmm7
        paddd   xmm8,xmm12
        paddd   xmm9,xmm13
        pxor    xmm0,xmm8
        pxor    xmm1,xmm9
DB      102,15,56,0,198                 ; pshufb xmm0,xmm6 (rot24)
DB      102,15,56,0,206                 ; pshufb xmm1,xmm6 (rot24)
        paddd   xmm4,xmm0
        paddd   xmm5,xmm1
        pxor    xmm12,xmm4
        pxor    xmm13,xmm5
        ; b <<<= 7; xmm7 is then reloaded with rot16
        movdqa  xmm7,xmm12
        pslld   xmm12,7
        psrld   xmm7,25
        movdqa  xmm6,xmm13
        pslld   xmm13,7
        por     xmm12,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]
        por     xmm13,xmm6
        ; Swap the c pair: spill x8/x9, load x10/x11.
        movdqa  XMMWORD[rsp],xmm4
        movdqa  XMMWORD[16+rsp],xmm5
        movdqa  xmm4,XMMWORD[32+rsp]
        movdqa  xmm5,XMMWORD[48+rsp]
        ; Column round, columns 2 and 3:
        ;   (xmm10,xmm14,xmm4=x10,xmm2) and (xmm11,xmm15,xmm5=x11,xmm3)
        paddd   xmm10,xmm14
        paddd   xmm11,xmm15
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
DB      102,15,56,0,215                 ; pshufb xmm2,xmm7 (rot16)
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7 (rot16)
        paddd   xmm4,xmm2
        paddd   xmm5,xmm3
        pxor    xmm14,xmm4
        pxor    xmm15,xmm5
        movdqa  xmm6,xmm14
        pslld   xmm14,12
        psrld   xmm6,20
        movdqa  xmm7,xmm15
        pslld   xmm15,12
        por     xmm14,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]
        por     xmm15,xmm7
        paddd   xmm10,xmm14
        paddd   xmm11,xmm15
        pxor    xmm2,xmm10
        pxor    xmm3,xmm11
DB      102,15,56,0,214                 ; pshufb xmm2,xmm6 (rot24)
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6 (rot24)
        paddd   xmm4,xmm2
        paddd   xmm5,xmm3
        pxor    xmm14,xmm4
        pxor    xmm15,xmm5
        movdqa  xmm7,xmm14
        pslld   xmm14,7
        psrld   xmm7,25
        movdqa  xmm6,xmm15
        pslld   xmm15,7
        por     xmm14,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]
        por     xmm15,xmm6
        ; Diagonal round, first two diagonals:
        ;   (xmm8,xmm13,xmm4=x10,xmm3) and (xmm9,xmm14,xmm5=x11,xmm0)
        paddd   xmm8,xmm13
        paddd   xmm9,xmm14
        pxor    xmm3,xmm8
        pxor    xmm0,xmm9
DB      102,15,56,0,223                 ; pshufb xmm3,xmm7 (rot16)
DB      102,15,56,0,199                 ; pshufb xmm0,xmm7 (rot16)
        paddd   xmm4,xmm3
        paddd   xmm5,xmm0
        pxor    xmm13,xmm4
        pxor    xmm14,xmm5
        movdqa  xmm6,xmm13
        pslld   xmm13,12
        psrld   xmm6,20
        movdqa  xmm7,xmm14
        pslld   xmm14,12
        por     xmm13,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]
        por     xmm14,xmm7
        paddd   xmm8,xmm13
        paddd   xmm9,xmm14
        pxor    xmm3,xmm8
        pxor    xmm0,xmm9
DB      102,15,56,0,222                 ; pshufb xmm3,xmm6 (rot24)
DB      102,15,56,0,198                 ; pshufb xmm0,xmm6 (rot24)
        paddd   xmm4,xmm3
        paddd   xmm5,xmm0
        pxor    xmm13,xmm4
        pxor    xmm14,xmm5
        movdqa  xmm7,xmm13
        pslld   xmm13,7
        psrld   xmm7,25
        movdqa  xmm6,xmm14
        pslld   xmm14,7
        por     xmm13,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]
        por     xmm14,xmm6
        ; Swap the c pair back: spill x10/x11, load x8/x9.
        movdqa  XMMWORD[32+rsp],xmm4
        movdqa  XMMWORD[48+rsp],xmm5
        movdqa  xmm4,XMMWORD[rsp]
        movdqa  xmm5,XMMWORD[16+rsp]
        ; Diagonal round, remaining two diagonals:
        ;   (xmm10,xmm15,xmm4=x8,xmm1) and (xmm11,xmm12,xmm5=x9,xmm2)
        paddd   xmm10,xmm15
        paddd   xmm11,xmm12
        pxor    xmm1,xmm10
        pxor    xmm2,xmm11
DB      102,15,56,0,207                 ; pshufb xmm1,xmm7 (rot16)
DB      102,15,56,0,215                 ; pshufb xmm2,xmm7 (rot16)
        paddd   xmm4,xmm1
        paddd   xmm5,xmm2
        pxor    xmm15,xmm4
        pxor    xmm12,xmm5
        movdqa  xmm6,xmm15
        pslld   xmm15,12
        psrld   xmm6,20
        movdqa  xmm7,xmm12
        pslld   xmm12,12
        por     xmm15,xmm6
        psrld   xmm7,20
        movdqa  xmm6,XMMWORD[r11]
        por     xmm12,xmm7
        paddd   xmm10,xmm15
        paddd   xmm11,xmm12
        pxor    xmm1,xmm10
        pxor    xmm2,xmm11
DB      102,15,56,0,206                 ; pshufb xmm1,xmm6 (rot24)
DB      102,15,56,0,214                 ; pshufb xmm2,xmm6 (rot24)
        paddd   xmm4,xmm1
        paddd   xmm5,xmm2
        pxor    xmm15,xmm4
        pxor    xmm12,xmm5
        movdqa  xmm7,xmm15
        pslld   xmm15,7
        psrld   xmm7,25
        movdqa  xmm6,xmm12
        pslld   xmm12,7
        por     xmm15,xmm7
        psrld   xmm6,25
        movdqa  xmm7,XMMWORD[r10]
        por     xmm12,xmm6
        dec     eax
        jnz     NEAR $L$oop4x

        ; Feed-forward and 4x4 dword transpose, one state row at a time.
        ; After each transpose a register (or scratch slot) holds 16
        ; contiguous keystream bytes of ONE block.
        ; Row 0 (x0..x3) -> xmm8, xmm9, xmm6, xmm11 (blocks 0..3).
        paddd   xmm8,XMMWORD[64+rsp]
        paddd   xmm9,XMMWORD[80+rsp]
        paddd   xmm10,XMMWORD[96+rsp]
        paddd   xmm11,XMMWORD[112+rsp]

        movdqa  xmm6,xmm8
        punpckldq       xmm8,xmm9
        movdqa  xmm7,xmm10
        punpckldq       xmm10,xmm11
        punpckhdq       xmm6,xmm9
        punpckhdq       xmm7,xmm11
        movdqa  xmm9,xmm8
        punpcklqdq      xmm8,xmm10
        movdqa  xmm11,xmm6
        punpcklqdq      xmm6,xmm7
        punpckhqdq      xmm9,xmm10
        punpckhqdq      xmm11,xmm7
        ; Row 1 (x4..x7) -> xmm12, xmm13, xmm10, xmm15.
        paddd   xmm12,XMMWORD[((128-256))+rcx]
        paddd   xmm13,XMMWORD[((144-256))+rcx]
        paddd   xmm14,XMMWORD[((160-256))+rcx]
        paddd   xmm15,XMMWORD[((176-256))+rcx]

        ; Park row 0 of blocks 0/1 in scratch and fetch the spilled
        ; x10/x11 into xmm8/xmm9.
        movdqa  XMMWORD[rsp],xmm8
        movdqa  XMMWORD[16+rsp],xmm9
        movdqa  xmm8,XMMWORD[32+rsp]
        movdqa  xmm9,XMMWORD[48+rsp]

        movdqa  xmm10,xmm12
        punpckldq       xmm12,xmm13
        movdqa  xmm7,xmm14
        punpckldq       xmm14,xmm15
        punpckhdq       xmm10,xmm13
        punpckhdq       xmm7,xmm15
        movdqa  xmm13,xmm12
        punpcklqdq      xmm12,xmm14
        movdqa  xmm15,xmm10
        punpcklqdq      xmm10,xmm7
        punpckhqdq      xmm13,xmm14
        punpckhqdq      xmm15,xmm7
        ; Row 2 (x8..x11) -> xmm4, xmm5, xmm14, xmm9.
        paddd   xmm4,XMMWORD[((192-256))+rcx]
        paddd   xmm5,XMMWORD[((208-256))+rcx]
        paddd   xmm8,XMMWORD[((224-256))+rcx]
        paddd   xmm9,XMMWORD[((240-256))+rcx]

        ; Park row 0 of blocks 2/3 in scratch.
        movdqa  XMMWORD[32+rsp],xmm6
        movdqa  XMMWORD[48+rsp],xmm11

        movdqa  xmm14,xmm4
        punpckldq       xmm4,xmm5
        movdqa  xmm7,xmm8
        punpckldq       xmm8,xmm9
        punpckhdq       xmm14,xmm5
        punpckhdq       xmm7,xmm9
        movdqa  xmm5,xmm4
        punpcklqdq      xmm4,xmm8
        movdqa  xmm9,xmm14
        punpcklqdq      xmm14,xmm7
        punpckhqdq      xmm5,xmm8
        punpckhqdq      xmm9,xmm7
        ; Row 3 (x12..x15) -> xmm0, xmm1, xmm8, xmm3.
        paddd   xmm0,XMMWORD[((256-256))+rcx]
        paddd   xmm1,XMMWORD[((272-256))+rcx]
        paddd   xmm2,XMMWORD[((288-256))+rcx]
        paddd   xmm3,XMMWORD[((304-256))+rcx]

        movdqa  xmm8,xmm0
        punpckldq       xmm0,xmm1
        movdqa  xmm7,xmm2
        punpckldq       xmm2,xmm3
        punpckhdq       xmm8,xmm1
        punpckhdq       xmm7,xmm3
        movdqa  xmm1,xmm0
        punpcklqdq      xmm0,xmm2
        movdqa  xmm3,xmm8
        punpcklqdq      xmm8,xmm7
        punpckhqdq      xmm1,xmm2
        punpckhqdq      xmm3,xmm7
        ; Keystream layout at this point (rows 0,1,2,3 of each block):
        ;   block 0: [rsp],    xmm12, xmm4,  xmm0
        ;   block 1: [rsp+16], xmm13, xmm5,  xmm1
        ;   block 2: [rsp+32], xmm10, xmm14, xmm8
        ;   block 3: [rsp+48], xmm15, xmm9,  xmm3
        cmp     rdx,64*4
        jb      NEAR $L$tail4x

        ; Full 256 bytes: XOR and store, loads interleaved with stores.
        ; xmm6, xmm11, xmm2, xmm7 are free and carry the input data.
        ; Block 0:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        ; Block 1:
        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1

        ; Block 2:
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[32+rsp]
        pxor    xmm11,xmm10
        pxor    xmm2,xmm14
        pxor    xmm7,xmm8

        ; Block 3:
        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[48+rsp]
        pxor    xmm11,xmm15
        pxor    xmm2,xmm9
        pxor    xmm7,xmm3
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]

        sub     rdx,64*4
        jnz     NEAR $L$oop_outer4x

        jmp     NEAR $L$done4x

; Fewer than 256 bytes remain.  Process as many whole 64-byte blocks as
; fit, then copy the next unused block's keystream to [rsp+0..63] and
; finish byte by byte in $L$oop_tail4x.
$L$tail4x:
        cmp     rdx,192
        jae     NEAR $L$192_or_more4x
        cmp     rdx,128
        jae     NEAR $L$128_or_more4x
        cmp     rdx,64
        jae     NEAR $L$64_or_more4x


        ; 1..63 bytes: [rsp] already holds row 0 of block 0; add the
        ; other three rows.
        xor     r10,r10                 ; r10 = byte index

        movdqa  XMMWORD[16+rsp],xmm12
        movdqa  XMMWORD[32+rsp],xmm4
        movdqa  XMMWORD[48+rsp],xmm0
        jmp     NEAR $L$oop_tail4x

ALIGN   32
; 64..127 bytes: one whole block, then a tail taken from block 1.
$L$64_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0
        movdqu  XMMWORD[rdi],xmm6
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm7
        ; Flags are still those of "cmp rdx,64" in $L$tail4x (the SSE
        ; moves and pxor do not modify EFLAGS): exactly 64 means done.
        je      NEAR $L$done4x

        movdqa  xmm6,XMMWORD[16+rsp]
        lea     rsi,[64+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm13
        lea     rdi,[64+rdi]
        movdqa  XMMWORD[32+rsp],xmm5
        sub     rdx,64
        movdqa  XMMWORD[48+rsp],xmm1
        jmp     NEAR $L$oop_tail4x

ALIGN   32
; 128..191 bytes: two whole blocks, then a tail taken from block 2.
$L$128_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1
        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  XMMWORD[112+rdi],xmm7
        ; Flags from "cmp rdx,128" in $L$tail4x: exactly 128 means done.
        je      NEAR $L$done4x

        movdqa  xmm6,XMMWORD[32+rsp]
        lea     rsi,[128+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm10
        lea     rdi,[128+rdi]
        movdqa  XMMWORD[32+rsp],xmm14
        sub     rdx,128
        movdqa  XMMWORD[48+rsp],xmm8
        jmp     NEAR $L$oop_tail4x

ALIGN   32
; 192..255 bytes: three whole blocks, then a tail taken from block 3.
$L$192_or_more4x:
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[rsp]
        pxor    xmm11,xmm12
        pxor    xmm2,xmm4
        pxor    xmm7,xmm0

        movdqu  XMMWORD[rdi],xmm6
        movdqu  xmm6,XMMWORD[64+rsi]
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  xmm11,XMMWORD[80+rsi]
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  xmm2,XMMWORD[96+rsi]
        movdqu  XMMWORD[48+rdi],xmm7
        movdqu  xmm7,XMMWORD[112+rsi]
        lea     rsi,[128+rsi]
        pxor    xmm6,XMMWORD[16+rsp]
        pxor    xmm11,xmm13
        pxor    xmm2,xmm5
        pxor    xmm7,xmm1

        movdqu  XMMWORD[64+rdi],xmm6
        movdqu  xmm6,XMMWORD[rsi]
        movdqu  XMMWORD[80+rdi],xmm11
        movdqu  xmm11,XMMWORD[16+rsi]
        movdqu  XMMWORD[96+rdi],xmm2
        movdqu  xmm2,XMMWORD[32+rsi]
        movdqu  XMMWORD[112+rdi],xmm7
        lea     rdi,[128+rdi]
        movdqu  xmm7,XMMWORD[48+rsi]
        pxor    xmm6,XMMWORD[32+rsp]
        pxor    xmm11,xmm10
        pxor    xmm2,xmm14
        pxor    xmm7,xmm8
        movdqu  XMMWORD[rdi],xmm6
        movdqu  XMMWORD[16+rdi],xmm11
        movdqu  XMMWORD[32+rdi],xmm2
        movdqu  XMMWORD[48+rdi],xmm7
        ; Flags from "cmp rdx,192" in $L$tail4x (lea does not modify
        ; them either): exactly 192 means done.
        je      NEAR $L$done4x

        ; rsi/rdi were already advanced by 128 above; 64 more skips the
        ; third block.
        movdqa  xmm6,XMMWORD[48+rsp]
        lea     rsi,[64+rsi]
        xor     r10,r10
        movdqa  XMMWORD[rsp],xmm6
        movdqa  XMMWORD[16+rsp],xmm15
        lea     rdi,[64+rdi]
        movdqa  XMMWORD[32+rsp],xmm9
        sub     rdx,192
        movdqa  XMMWORD[48+rsp],xmm3

; Byte-wise XOR of the remaining rdx (1..63) bytes with the keystream at
; [rsp+0..63].  ecx is free as scratch: the rcx frame base is not used
; past this point.
$L$oop_tail4x:
        movzx   eax,BYTE[r10*1+rsi]     ; input byte
        movzx   ecx,BYTE[r10*1+rsp]     ; keystream byte
        lea     r10,[1+r10]
        xor     eax,ecx
        mov     BYTE[((-1))+r10*1+rdi],al
        dec     rdx
        jnz     NEAR $L$oop_tail4x

$L$done4x:
        ; r11 = rsp+368 reproduces the base used when saving xmm6..xmm15.
        lea     r11,[((320+48))+rsp]
        movaps  xmm6,XMMWORD[((-48))+r11]
        movaps  xmm7,XMMWORD[((-32))+r11]
        movaps  xmm8,XMMWORD[((-16))+r11]
        movaps  xmm9,XMMWORD[r11]
        movaps  xmm10,XMMWORD[16+r11]
        movaps  xmm11,XMMWORD[32+r11]
        movaps  xmm12,XMMWORD[48+r11]
        movaps  xmm13,XMMWORD[64+r11]
        movaps  xmm14,XMMWORD[80+r11]
        movaps  xmm15,XMMWORD[96+r11]
        add     rsp,0x148+160
        mov     rdi,QWORD[8+rsp]        ;WIN64 epilogue
        mov     rsi,QWORD[16+rsp]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_4x:

ALIGN   32
ChaCha20_8x:
        mov     QWORD[8+rsp],rdi        ;WIN64 prologue
        mov     QWORD[16+rsp],rsi
        mov     rax,rsp
$L$SEH_begin_ChaCha20_8x:
        mov     rdi,rcx
        mov     rsi,rdx
        mov     rdx,r8
        mov     rcx,r9
        mov     r8,QWORD[40+rsp]


$L$ChaCha20_8x:
        mov     r10,rsp
        sub     rsp,0x280+176
        and     rsp,-32
        lea     r11,[((656+48))+rsp]
        movaps  XMMWORD[(-48)+r11],xmm6
        movaps  XMMWORD[(-32)+r11],xmm7
        movaps  XMMWORD[(-16)+r11],xmm8
        movaps  XMMWORD[r11],xmm9
        movaps  XMMWORD[16+r11],xmm10
        movaps  XMMWORD[32+r11],xmm11
        movaps  XMMWORD[48+r11],xmm12
        movaps  XMMWORD[64+r11],xmm13
        movaps  XMMWORD[80+r11],xmm14
        movaps  XMMWORD[96+r11],xmm15
        vzeroupper
        mov     QWORD[640+rsp],r10










        vbroadcasti128  ymm11,YMMWORD[$L$sigma]
        vbroadcasti128  ymm3,YMMWORD[rcx]
        vbroadcasti128  ymm15,YMMWORD[16+rcx]
        vbroadcasti128  ymm7,YMMWORD[r8]
        lea     rcx,[256+rsp]
        lea     rax,[512+rsp]
        lea     r10,[$L$rot16]
        lea     r11,[$L$rot24]

        vpshufd ymm8,ymm11,0x00
        vpshufd ymm9,ymm11,0x55
        vmovdqa YMMWORD[(128-256)+rcx],ymm8
        vpshufd ymm10,ymm11,0xaa
        vmovdqa YMMWORD[(160-256)+rcx],ymm9
        vpshufd ymm11,ymm11,0xff
        vmovdqa YMMWORD[(192-256)+rcx],ymm10
        vmovdqa YMMWORD[(224-256)+rcx],ymm11

        vpshufd ymm0,ymm3,0x00
        vpshufd ymm1,ymm3,0x55
        vmovdqa YMMWORD[(256-256)+rcx],ymm0
        vpshufd ymm2,ymm3,0xaa
        vmovdqa YMMWORD[(288-256)+rcx],ymm1
        vpshufd ymm3,ymm3,0xff
        vmovdqa YMMWORD[(320-256)+rcx],ymm2
        vmovdqa YMMWORD[(352-256)+rcx],ymm3

        vpshufd ymm12,ymm15,0x00
        vpshufd ymm13,ymm15,0x55
        vmovdqa YMMWORD[(384-512)+rax],ymm12
        vpshufd ymm14,ymm15,0xaa
        vmovdqa YMMWORD[(416-512)+rax],ymm13
        vpshufd ymm15,ymm15,0xff
        vmovdqa YMMWORD[(448-512)+rax],ymm14
        vmovdqa YMMWORD[(480-512)+rax],ymm15

        vpshufd ymm4,ymm7,0x00
        vpshufd ymm5,ymm7,0x55
        vpaddd  ymm4,ymm4,YMMWORD[$L$incy]
        vpshufd ymm6,ymm7,0xaa
        vmovdqa YMMWORD[(544-512)+rax],ymm5
        vpshufd ymm7,ymm7,0xff
        vmovdqa YMMWORD[(576-512)+rax],ymm6
        vmovdqa YMMWORD[(608-512)+rax],ymm7

        jmp     NEAR $L$oop_enter8x

ALIGN   32
$L$oop_outer8x:
        ; Start of a further 8-block batch: reload the sixteen lane-replicated
        ; state words from the frame slots addressed through rcx (rsp+256) and
        ; rax (rsp+512).
        ;   ymm8..ymm11  <- row 0      ymm0..ymm3 <- row 1
        ;   ymm12..ymm15 <- row 2      ymm4..ymm7 <- row 3
        vmovdqa ymm8,YMMWORD[((128-256))+rcx]
        vmovdqa ymm9,YMMWORD[((160-256))+rcx]
        vmovdqa ymm10,YMMWORD[((192-256))+rcx]
        vmovdqa ymm11,YMMWORD[((224-256))+rcx]
        vmovdqa ymm0,YMMWORD[((256-256))+rcx]
        vmovdqa ymm1,YMMWORD[((288-256))+rcx]
        vmovdqa ymm2,YMMWORD[((320-256))+rcx]
        vmovdqa ymm3,YMMWORD[((352-256))+rcx]
        vmovdqa ymm12,YMMWORD[((384-512))+rax]
        vmovdqa ymm13,YMMWORD[((416-512))+rax]
        vmovdqa ymm14,YMMWORD[((448-512))+rax]
        vmovdqa ymm15,YMMWORD[((480-512))+rax]
        vmovdqa ymm4,YMMWORD[((512-512))+rax]
        vmovdqa ymm5,YMMWORD[((544-512))+rax]
        vmovdqa ymm6,YMMWORD[((576-512))+rax]
        vmovdqa ymm7,YMMWORD[((608-512))+rax]
        ; ymm4 holds the previous batch's per-lane counters; step each lane
        ; by 8 (32-bit add, so each lane wraps modulo 2^32).
        vpaddd  ymm4,ymm4,YMMWORD[$L$eight]

$L$oop_enter8x:
        ; The round loop uses ymm14/ymm15 for shuffle masks and shift
        ; temporaries, so the two state words they currently hold are kept
        ; in the frame at [64+rsp] and [96+rsp] instead.
        vmovdqa YMMWORD[64+rsp],ymm14
        vmovdqa YMMWORD[96+rsp],ymm15
        ; ymm15 = shuffle mask read through r10 (r10 was pointed at $L$rot16
        ; during setup), duplicated into both 128-bit halves.
        vbroadcasti128  ymm15,YMMWORD[r10]
        ; Record this batch's per-lane counters; they are added back to ymm4
        ; after the rounds and reloaded by $L$oop_outer8x.
        vmovdqa YMMWORD[(512-512)+rax],ymm4
        ; eax becomes the iteration counter for $L$oop8x. This overwrites the
        ; rax frame pointer, which is re-derived with lea after the loop.
        mov     eax,10
        jmp     NEAR $L$oop8x

ALIGN   32
$L$oop8x:
        ; One iteration = one column round followed by one diagonal round,
        ; applied to eight blocks at once (one block per 32-bit lane).
        ; eax counts the iterations down from 10.
        ;
        ; Register roles:
        ;   a-words: ymm8..ymm11     b-words: ymm0..ymm3
        ;   d-words: ymm4..ymm7
        ;   c-words: only two are in ymm12/ymm13 at any time; the other two
        ;            sit in the frame ([rsp],[32+rsp] or [64+rsp],[96+rsp]).
        ;   ymm14/ymm15 alternate between vpshufb mask ([r10] / [r11]) and
        ;            scratch for the shift-based rotates, hence the repeated
        ;            vbroadcasti128 reloads.
        ;
        ; Each quarter-round is:
        ;   a += b; d ^= a; d <<<= 16   (vpshufb, mask from [r10])
        ;   c += d; b ^= c; b <<<= 12   (vpslld 12 / vpsrld 20 / vpor)
        ;   a += b; d ^= a; d <<<= 8    (vpshufb, mask from [r11])
        ;   c += d; b ^= c; b <<<= 7    (vpslld 7 / vpsrld 25 / vpor)
        ; Two quarter-rounds are interleaved in every group below.

        ; ---- column round: (ymm8,ymm0,ymm12,ymm4) and (ymm9,ymm1,ymm13,ymm5)
        vpaddd  ymm8,ymm8,ymm0
        vpxor   ymm4,ymm8,ymm4
        vpshufb ymm4,ymm4,ymm15
        vpaddd  ymm9,ymm9,ymm1
        vpxor   ymm5,ymm9,ymm5
        vpshufb ymm5,ymm5,ymm15
        vpaddd  ymm12,ymm12,ymm4
        vpxor   ymm0,ymm12,ymm0
        vpslld  ymm14,ymm0,12
        vpsrld  ymm0,ymm0,20
        vpor    ymm0,ymm14,ymm0
        ; ymm14 <- mask at [r11]; ymm15 (the [r10] mask) is about to be
        ; clobbered as a shift temporary.
        vbroadcasti128  ymm14,YMMWORD[r11]
        vpaddd  ymm13,ymm13,ymm5
        vpxor   ymm1,ymm13,ymm1
        vpslld  ymm15,ymm1,12
        vpsrld  ymm1,ymm1,20
        vpor    ymm1,ymm15,ymm1
        vpaddd  ymm8,ymm8,ymm0
        vpxor   ymm4,ymm8,ymm4
        vpshufb ymm4,ymm4,ymm14
        vpaddd  ymm9,ymm9,ymm1
        vpxor   ymm5,ymm9,ymm5
        vpshufb ymm5,ymm5,ymm14
        vpaddd  ymm12,ymm12,ymm4
        vpxor   ymm0,ymm12,ymm0
        vpslld  ymm15,ymm0,7
        vpsrld  ymm0,ymm0,25
        vpor    ymm0,ymm15,ymm0
        ; ymm15 <- mask at [r10] again, ready for the next group.
        vbroadcasti128  ymm15,YMMWORD[r10]
        vpaddd  ymm13,ymm13,ymm5
        vpxor   ymm1,ymm13,ymm1
        vpslld  ymm14,ymm1,7
        vpsrld  ymm1,ymm1,25
        vpor    ymm1,ymm14,ymm1
        ; Swap the resident c-words: park the pair just used at [rsp],[32+rsp]
        ; and bring in the pair kept at [64+rsp],[96+rsp].
        vmovdqa YMMWORD[rsp],ymm12
        vmovdqa YMMWORD[32+rsp],ymm13
        vmovdqa ymm12,YMMWORD[64+rsp]
        vmovdqa ymm13,YMMWORD[96+rsp]
        ; ---- column round: (ymm10,ymm2,ymm12,ymm6) and (ymm11,ymm3,ymm13,ymm7)
        vpaddd  ymm10,ymm10,ymm2
        vpxor   ymm6,ymm10,ymm6
        vpshufb ymm6,ymm6,ymm15
        vpaddd  ymm11,ymm11,ymm3
        vpxor   ymm7,ymm11,ymm7
        vpshufb ymm7,ymm7,ymm15
        vpaddd  ymm12,ymm12,ymm6
        vpxor   ymm2,ymm12,ymm2
        vpslld  ymm14,ymm2,12
        vpsrld  ymm2,ymm2,20
        vpor    ymm2,ymm14,ymm2
        vbroadcasti128  ymm14,YMMWORD[r11]
        vpaddd  ymm13,ymm13,ymm7
        vpxor   ymm3,ymm13,ymm3
        vpslld  ymm15,ymm3,12
        vpsrld  ymm3,ymm3,20
        vpor    ymm3,ymm15,ymm3
        vpaddd  ymm10,ymm10,ymm2
        vpxor   ymm6,ymm10,ymm6
        vpshufb ymm6,ymm6,ymm14
        vpaddd  ymm11,ymm11,ymm3
        vpxor   ymm7,ymm11,ymm7
        vpshufb ymm7,ymm7,ymm14
        vpaddd  ymm12,ymm12,ymm6
        vpxor   ymm2,ymm12,ymm2
        vpslld  ymm15,ymm2,7
        vpsrld  ymm2,ymm2,25
        vpor    ymm2,ymm15,ymm2
        vbroadcasti128  ymm15,YMMWORD[r10]
        vpaddd  ymm13,ymm13,ymm7
        vpxor   ymm3,ymm13,ymm3
        vpslld  ymm14,ymm3,7
        vpsrld  ymm3,ymm3,25
        vpor    ymm3,ymm14,ymm3
        ; ---- diagonal round: (ymm8,ymm1,ymm12,ymm7) and (ymm9,ymm2,ymm13,ymm4)
        ; ymm12/ymm13 still hold the c-word pair loaded from [64+rsp],[96+rsp].
        vpaddd  ymm8,ymm8,ymm1
        vpxor   ymm7,ymm8,ymm7
        vpshufb ymm7,ymm7,ymm15
        vpaddd  ymm9,ymm9,ymm2
        vpxor   ymm4,ymm9,ymm4
        vpshufb ymm4,ymm4,ymm15
        vpaddd  ymm12,ymm12,ymm7
        vpxor   ymm1,ymm12,ymm1
        vpslld  ymm14,ymm1,12
        vpsrld  ymm1,ymm1,20
        vpor    ymm1,ymm14,ymm1
        vbroadcasti128  ymm14,YMMWORD[r11]
        vpaddd  ymm13,ymm13,ymm4
        vpxor   ymm2,ymm13,ymm2
        vpslld  ymm15,ymm2,12
        vpsrld  ymm2,ymm2,20
        vpor    ymm2,ymm15,ymm2
        vpaddd  ymm8,ymm8,ymm1
        vpxor   ymm7,ymm8,ymm7
        vpshufb ymm7,ymm7,ymm14
        vpaddd  ymm9,ymm9,ymm2
        vpxor   ymm4,ymm9,ymm4
        vpshufb ymm4,ymm4,ymm14
        vpaddd  ymm12,ymm12,ymm7
        vpxor   ymm1,ymm12,ymm1
        vpslld  ymm15,ymm1,7
        vpsrld  ymm1,ymm1,25
        vpor    ymm1,ymm15,ymm1
        vbroadcasti128  ymm15,YMMWORD[r10]
        vpaddd  ymm13,ymm13,ymm4
        vpxor   ymm2,ymm13,ymm2
        vpslld  ymm14,ymm2,7
        vpsrld  ymm2,ymm2,25
        vpor    ymm2,ymm14,ymm2
        ; Swap the resident c-words back: park this pair at [64+rsp],[96+rsp]
        ; and reload the pair parked at [rsp],[32+rsp].
        vmovdqa YMMWORD[64+rsp],ymm12
        vmovdqa YMMWORD[96+rsp],ymm13
        vmovdqa ymm12,YMMWORD[rsp]
        vmovdqa ymm13,YMMWORD[32+rsp]
        ; ---- diagonal round: (ymm10,ymm3,ymm12,ymm5) and (ymm11,ymm0,ymm13,ymm6)
        vpaddd  ymm10,ymm10,ymm3
        vpxor   ymm5,ymm10,ymm5
        vpshufb ymm5,ymm5,ymm15
        vpaddd  ymm11,ymm11,ymm0
        vpxor   ymm6,ymm11,ymm6
        vpshufb ymm6,ymm6,ymm15
        vpaddd  ymm12,ymm12,ymm5
        vpxor   ymm3,ymm12,ymm3
        vpslld  ymm14,ymm3,12
        vpsrld  ymm3,ymm3,20
        vpor    ymm3,ymm14,ymm3
        vbroadcasti128  ymm14,YMMWORD[r11]
        vpaddd  ymm13,ymm13,ymm6
        vpxor   ymm0,ymm13,ymm0
        vpslld  ymm15,ymm0,12
        vpsrld  ymm0,ymm0,20
        vpor    ymm0,ymm15,ymm0
        vpaddd  ymm10,ymm10,ymm3
        vpxor   ymm5,ymm10,ymm5
        vpshufb ymm5,ymm5,ymm14
        vpaddd  ymm11,ymm11,ymm0
        vpxor   ymm6,ymm11,ymm6
        vpshufb ymm6,ymm6,ymm14
        vpaddd  ymm12,ymm12,ymm5
        vpxor   ymm3,ymm12,ymm3
        vpslld  ymm15,ymm3,7
        vpsrld  ymm3,ymm3,25
        vpor    ymm3,ymm15,ymm3
        ; ymm15 <- mask at [r10], as required at the top of the next iteration.
        vbroadcasti128  ymm15,YMMWORD[r10]
        vpaddd  ymm13,ymm13,ymm6
        vpxor   ymm0,ymm13,ymm0
        vpslld  ymm14,ymm0,7
        vpsrld  ymm0,ymm0,25
        vpor    ymm0,ymm14,ymm0
        ; At loop exit ymm12/ymm13 hold one c-word pair and the other pair
        ; is at [64+rsp],[96+rsp].
        dec     eax
        jnz     NEAR $L$oop8x

        ; The rounds are finished. eax was the loop counter, so rebuild the
        ; rax frame pointer (rsp+512) before touching the saved-state slots.
        lea     rax,[512+rsp]
        ; Feed-forward for row 0: add the saved input words to the round output.
        vpaddd  ymm8,ymm8,YMMWORD[((128-256))+rcx]
        vpaddd  ymm9,ymm9,YMMWORD[((160-256))+rcx]
        vpaddd  ymm10,ymm10,YMMWORD[((192-256))+rcx]
        vpaddd  ymm11,ymm11,YMMWORD[((224-256))+rcx]

        ; 4x4 dword transpose inside each 128-bit half: turns "one word for
        ; four blocks" into "four consecutive words of one block".
        ; Results land in ymm9, ymm14, ymm11, ymm8.
        vpunpckldq      ymm14,ymm8,ymm9
        vpunpckldq      ymm15,ymm10,ymm11
        vpunpckhdq      ymm8,ymm8,ymm9
        vpunpckhdq      ymm10,ymm10,ymm11
        vpunpcklqdq     ymm9,ymm14,ymm15
        vpunpckhqdq     ymm14,ymm14,ymm15
        vpunpcklqdq     ymm11,ymm8,ymm10
        vpunpckhqdq     ymm8,ymm8,ymm10
        ; Feed-forward for row 1.
        vpaddd  ymm0,ymm0,YMMWORD[((256-256))+rcx]
        vpaddd  ymm1,ymm1,YMMWORD[((288-256))+rcx]
        vpaddd  ymm2,ymm2,YMMWORD[((320-256))+rcx]
        vpaddd  ymm3,ymm3,YMMWORD[((352-256))+rcx]

        ; Same in-half transpose for row 1; results in ymm1, ymm10, ymm3, ymm0.
        vpunpckldq      ymm10,ymm0,ymm1
        vpunpckldq      ymm15,ymm2,ymm3
        vpunpckhdq      ymm0,ymm0,ymm1
        vpunpckhdq      ymm2,ymm2,ymm3
        vpunpcklqdq     ymm1,ymm10,ymm15
        vpunpckhqdq     ymm10,ymm10,ymm15
        vpunpcklqdq     ymm3,ymm0,ymm2
        vpunpckhqdq     ymm0,ymm0,ymm2
        ; Pair row 0 with row 1 of the same block: selector 0x20 joins the low
        ; 128-bit halves of the two sources, 0x31 joins the high halves. Each
        ; result is the first 32 bytes of one block.
        vperm2i128      ymm15,ymm9,ymm1,0x20
        vperm2i128      ymm1,ymm9,ymm1,0x31
        vperm2i128      ymm9,ymm14,ymm10,0x20
        vperm2i128      ymm10,ymm14,ymm10,0x31
        vperm2i128      ymm14,ymm11,ymm3,0x20
        vperm2i128      ymm3,ymm11,ymm3,0x31
        vperm2i128      ymm11,ymm8,ymm0,0x20
        vperm2i128      ymm0,ymm8,ymm0,0x31
        ; Out of registers: park two finished results at [rsp],[32+rsp] and
        ; fetch the two row-2 words the round loop left at [64+rsp],[96+rsp].
        vmovdqa YMMWORD[rsp],ymm15
        vmovdqa YMMWORD[32+rsp],ymm9
        vmovdqa ymm15,YMMWORD[64+rsp]
        vmovdqa ymm9,YMMWORD[96+rsp]

        ; Feed-forward for row 2 (words in ymm12, ymm13, ymm15, ymm9).
        vpaddd  ymm12,ymm12,YMMWORD[((384-512))+rax]
        vpaddd  ymm13,ymm13,YMMWORD[((416-512))+rax]
        vpaddd  ymm15,ymm15,YMMWORD[((448-512))+rax]
        vpaddd  ymm9,ymm9,YMMWORD[((480-512))+rax]

        ; In-half transpose for row 2; results in ymm13, ymm2, ymm9, ymm12.
        vpunpckldq      ymm2,ymm12,ymm13
        vpunpckldq      ymm8,ymm15,ymm9
        vpunpckhdq      ymm12,ymm12,ymm13
        vpunpckhdq      ymm15,ymm15,ymm9
        vpunpcklqdq     ymm13,ymm2,ymm8
        vpunpckhqdq     ymm2,ymm2,ymm8
        vpunpcklqdq     ymm9,ymm12,ymm15
        vpunpckhqdq     ymm12,ymm12,ymm15
        ; Feed-forward for row 3. The slot at (512-512)+rax holds this batch's
        ; per-lane counters, stored at $L$oop_enter8x.
        vpaddd  ymm4,ymm4,YMMWORD[((512-512))+rax]
        vpaddd  ymm5,ymm5,YMMWORD[((544-512))+rax]
        vpaddd  ymm6,ymm6,YMMWORD[((576-512))+rax]
        vpaddd  ymm7,ymm7,YMMWORD[((608-512))+rax]

        ; In-half transpose for row 3; results in ymm5, ymm15, ymm7, ymm4.
        vpunpckldq      ymm15,ymm4,ymm5
        vpunpckldq      ymm8,ymm6,ymm7
        vpunpckhdq      ymm4,ymm4,ymm5
        vpunpckhdq      ymm6,ymm6,ymm7
        vpunpcklqdq     ymm5,ymm15,ymm8
        vpunpckhqdq     ymm15,ymm15,ymm8
        vpunpcklqdq     ymm7,ymm4,ymm6
        vpunpckhqdq     ymm4,ymm4,ymm6
        ; Pair row 2 with row 3 of the same block (last 32 bytes of each block).
        vperm2i128      ymm8,ymm13,ymm5,0x20
        vperm2i128      ymm5,ymm13,ymm5,0x31
        vperm2i128      ymm13,ymm2,ymm15,0x20
        vperm2i128      ymm15,ymm2,ymm15,0x31
        vperm2i128      ymm2,ymm9,ymm7,0x20
        vperm2i128      ymm7,ymm9,ymm7,0x31
        vperm2i128      ymm9,ymm12,ymm4,0x20
        vperm2i128      ymm4,ymm12,ymm4,0x31
        ; Bring back the two results parked at [rsp],[32+rsp].
        vmovdqa ymm6,YMMWORD[rsp]
        vmovdqa ymm12,YMMWORD[32+rsp]

        ; The 512 bytes of keystream are now held, in output order, by:
        ;   ymm6  ymm8  ymm1  ymm5   ymm12 ymm13 ymm10 ymm15
        ;   ymm14 ymm2  ymm3  ymm7   ymm11 ymm9  ymm0  ymm4
        ; rdx = bytes still to process. With fewer than 512 left, hand over
        ; to the partial-batch code.
        cmp     rdx,64*8
        jb      NEAR $L$tail8x

        ; Full batch: dst[i] = src[i] XOR keystream[i], 128 bytes per group.
        ; rsi = source pointer, rdi = destination pointer.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        lea     rsi,[128+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        lea     rdi,[128+rdi]

        vpxor   ymm12,ymm12,YMMWORD[rsi]
        vpxor   ymm13,ymm13,YMMWORD[32+rsi]
        vpxor   ymm10,ymm10,YMMWORD[64+rsi]
        vpxor   ymm15,ymm15,YMMWORD[96+rsi]
        lea     rsi,[128+rsi]
        vmovdqu YMMWORD[rdi],ymm12
        vmovdqu YMMWORD[32+rdi],ymm13
        vmovdqu YMMWORD[64+rdi],ymm10
        vmovdqu YMMWORD[96+rdi],ymm15
        lea     rdi,[128+rdi]

        vpxor   ymm14,ymm14,YMMWORD[rsi]
        vpxor   ymm2,ymm2,YMMWORD[32+rsi]
        vpxor   ymm3,ymm3,YMMWORD[64+rsi]
        vpxor   ymm7,ymm7,YMMWORD[96+rsi]
        lea     rsi,[128+rsi]
        vmovdqu YMMWORD[rdi],ymm14
        vmovdqu YMMWORD[32+rdi],ymm2
        vmovdqu YMMWORD[64+rdi],ymm3
        vmovdqu YMMWORD[96+rdi],ymm7
        lea     rdi,[128+rdi]

        vpxor   ymm11,ymm11,YMMWORD[rsi]
        vpxor   ymm9,ymm9,YMMWORD[32+rsi]
        vpxor   ymm0,ymm0,YMMWORD[64+rsi]
        vpxor   ymm4,ymm4,YMMWORD[96+rsi]
        lea     rsi,[128+rsi]
        vmovdqu YMMWORD[rdi],ymm11
        vmovdqu YMMWORD[32+rdi],ymm9
        vmovdqu YMMWORD[64+rdi],ymm0
        vmovdqu YMMWORD[96+rdi],ymm4
        lea     rdi,[128+rdi]

        ; 512 bytes consumed; go round again while any input remains.
        sub     rdx,64*8
        jnz     NEAR $L$oop_outer8x

        jmp     NEAR $L$done8x

$L$tail8x:
        ; Fewer than 512 bytes remain (rdx). Pick the handler for the largest
        ; whole number of 64-byte blocks that still fits.
        ; The flags of the cmp that takes the branch stay live at the target:
        ; each handler's "je" uses them to detect rdx == threshold exactly.
        cmp     rdx,448
        jae     NEAR $L$448_or_more8x
        cmp     rdx,384
        jae     NEAR $L$384_or_more8x
        cmp     rdx,320
        jae     NEAR $L$320_or_more8x
        cmp     rdx,256
        jae     NEAR $L$256_or_more8x
        cmp     rdx,192
        jae     NEAR $L$192_or_more8x
        cmp     rdx,128
        jae     NEAR $L$128_or_more8x
        cmp     rdx,64
        jae     NEAR $L$64_or_more8x

        ; Under 64 bytes: no whole block to emit. Put the first 64 bytes of
        ; keystream (ymm6, ymm8) at [rsp] and let the byte loop finish, with
        ; r10 = 0 as its running index.
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm6
        vmovdqa YMMWORD[32+rsp],ymm8
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$64_or_more8x:
        ; Reached from $L$tail8x with 64 <= rdx < 128: emit one whole block.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        ; Flags are still those of "cmp rdx,64" in $L$tail8x (the AVX
        ; instructions above leave them alone): equal means nothing is left.
        je      NEAR $L$done8x

        ; A partial block follows: step past the 64 bytes just written, stage
        ; the next 64 bytes of keystream (ymm1, ymm5) at [rsp], and zero the
        ; byte-loop index r10.
        lea     rsi,[64+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm1
        lea     rdi,[64+rdi]
        sub     rdx,64
        vmovdqa YMMWORD[32+rsp],ymm5
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$128_or_more8x:
        ; Reached from $L$tail8x with 128 <= rdx < 192: emit two whole blocks.
        ; All input is read before any output is stored.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        ; Flags still come from "cmp rdx,128" in $L$tail8x: equal means done.
        je      NEAR $L$done8x

        ; Partial block left: advance by 128 bytes, stage the next 64 bytes of
        ; keystream (ymm12, ymm13) at [rsp], zero the byte-loop index r10.
        lea     rsi,[128+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm12
        lea     rdi,[128+rdi]
        sub     rdx,128
        vmovdqa YMMWORD[32+rsp],ymm13
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$192_or_more8x:
        ; Reached from $L$tail8x with 192 <= rdx < 256: emit three whole blocks.
        ; All input is read before any output is stored.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        vpxor   ymm12,ymm12,YMMWORD[128+rsi]
        vpxor   ymm13,ymm13,YMMWORD[160+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        vmovdqu YMMWORD[128+rdi],ymm12
        vmovdqu YMMWORD[160+rdi],ymm13
        ; Flags still come from "cmp rdx,192" in $L$tail8x: equal means done.
        je      NEAR $L$done8x

        ; Partial block left: advance by 192 bytes, stage the next 64 bytes of
        ; keystream (ymm10, ymm15) at [rsp], zero the byte-loop index r10.
        lea     rsi,[192+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm10
        lea     rdi,[192+rdi]
        sub     rdx,192
        vmovdqa YMMWORD[32+rsp],ymm15
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$256_or_more8x:
        ; Reached from $L$tail8x with 256 <= rdx < 320: emit four whole blocks.
        ; All input is read before any output is stored.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        vpxor   ymm12,ymm12,YMMWORD[128+rsi]
        vpxor   ymm13,ymm13,YMMWORD[160+rsi]
        vpxor   ymm10,ymm10,YMMWORD[192+rsi]
        vpxor   ymm15,ymm15,YMMWORD[224+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        vmovdqu YMMWORD[128+rdi],ymm12
        vmovdqu YMMWORD[160+rdi],ymm13
        vmovdqu YMMWORD[192+rdi],ymm10
        vmovdqu YMMWORD[224+rdi],ymm15
        ; Flags still come from "cmp rdx,256" in $L$tail8x: equal means done.
        je      NEAR $L$done8x

        ; Partial block left: advance by 256 bytes, stage the next 64 bytes of
        ; keystream (ymm14, ymm2) at [rsp], zero the byte-loop index r10.
        lea     rsi,[256+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm14
        lea     rdi,[256+rdi]
        sub     rdx,256
        vmovdqa YMMWORD[32+rsp],ymm2
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$320_or_more8x:
        ; Reached from $L$tail8x with 320 <= rdx < 384: emit five whole blocks.
        ; All input is read before any output is stored.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        vpxor   ymm12,ymm12,YMMWORD[128+rsi]
        vpxor   ymm13,ymm13,YMMWORD[160+rsi]
        vpxor   ymm10,ymm10,YMMWORD[192+rsi]
        vpxor   ymm15,ymm15,YMMWORD[224+rsi]
        vpxor   ymm14,ymm14,YMMWORD[256+rsi]
        vpxor   ymm2,ymm2,YMMWORD[288+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        vmovdqu YMMWORD[128+rdi],ymm12
        vmovdqu YMMWORD[160+rdi],ymm13
        vmovdqu YMMWORD[192+rdi],ymm10
        vmovdqu YMMWORD[224+rdi],ymm15
        vmovdqu YMMWORD[256+rdi],ymm14
        vmovdqu YMMWORD[288+rdi],ymm2
        ; Flags still come from "cmp rdx,320" in $L$tail8x: equal means done.
        je      NEAR $L$done8x

        ; Partial block left: advance by 320 bytes, stage the next 64 bytes of
        ; keystream (ymm3, ymm7) at [rsp], zero the byte-loop index r10.
        lea     rsi,[320+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm3
        lea     rdi,[320+rdi]
        sub     rdx,320
        vmovdqa YMMWORD[32+rsp],ymm7
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$384_or_more8x:
        ; Reached from $L$tail8x with 384 <= rdx < 448: emit six whole blocks.
        ; All input is read before any output is stored.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        vpxor   ymm12,ymm12,YMMWORD[128+rsi]
        vpxor   ymm13,ymm13,YMMWORD[160+rsi]
        vpxor   ymm10,ymm10,YMMWORD[192+rsi]
        vpxor   ymm15,ymm15,YMMWORD[224+rsi]
        vpxor   ymm14,ymm14,YMMWORD[256+rsi]
        vpxor   ymm2,ymm2,YMMWORD[288+rsi]
        vpxor   ymm3,ymm3,YMMWORD[320+rsi]
        vpxor   ymm7,ymm7,YMMWORD[352+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        vmovdqu YMMWORD[128+rdi],ymm12
        vmovdqu YMMWORD[160+rdi],ymm13
        vmovdqu YMMWORD[192+rdi],ymm10
        vmovdqu YMMWORD[224+rdi],ymm15
        vmovdqu YMMWORD[256+rdi],ymm14
        vmovdqu YMMWORD[288+rdi],ymm2
        vmovdqu YMMWORD[320+rdi],ymm3
        vmovdqu YMMWORD[352+rdi],ymm7
        ; Flags still come from "cmp rdx,384" in $L$tail8x: equal means done.
        je      NEAR $L$done8x

        ; Partial block left: advance by 384 bytes, stage the next 64 bytes of
        ; keystream (ymm11, ymm9) at [rsp], zero the byte-loop index r10.
        lea     rsi,[384+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm11
        lea     rdi,[384+rdi]
        sub     rdx,384
        vmovdqa YMMWORD[32+rsp],ymm9
        jmp     NEAR $L$oop_tail8x

ALIGN   32
$L$448_or_more8x:
        ; Reached from $L$tail8x with 448 <= rdx < 512: emit seven whole blocks.
        ; All input is read before any output is stored.
        vpxor   ymm6,ymm6,YMMWORD[rsi]
        vpxor   ymm8,ymm8,YMMWORD[32+rsi]
        vpxor   ymm1,ymm1,YMMWORD[64+rsi]
        vpxor   ymm5,ymm5,YMMWORD[96+rsi]
        vpxor   ymm12,ymm12,YMMWORD[128+rsi]
        vpxor   ymm13,ymm13,YMMWORD[160+rsi]
        vpxor   ymm10,ymm10,YMMWORD[192+rsi]
        vpxor   ymm15,ymm15,YMMWORD[224+rsi]
        vpxor   ymm14,ymm14,YMMWORD[256+rsi]
        vpxor   ymm2,ymm2,YMMWORD[288+rsi]
        vpxor   ymm3,ymm3,YMMWORD[320+rsi]
        vpxor   ymm7,ymm7,YMMWORD[352+rsi]
        vpxor   ymm11,ymm11,YMMWORD[384+rsi]
        vpxor   ymm9,ymm9,YMMWORD[416+rsi]
        vmovdqu YMMWORD[rdi],ymm6
        vmovdqu YMMWORD[32+rdi],ymm8
        vmovdqu YMMWORD[64+rdi],ymm1
        vmovdqu YMMWORD[96+rdi],ymm5
        vmovdqu YMMWORD[128+rdi],ymm12
        vmovdqu YMMWORD[160+rdi],ymm13
        vmovdqu YMMWORD[192+rdi],ymm10
        vmovdqu YMMWORD[224+rdi],ymm15
        vmovdqu YMMWORD[256+rdi],ymm14
        vmovdqu YMMWORD[288+rdi],ymm2
        vmovdqu YMMWORD[320+rdi],ymm3
        vmovdqu YMMWORD[352+rdi],ymm7
        vmovdqu YMMWORD[384+rdi],ymm11
        vmovdqu YMMWORD[416+rdi],ymm9
        ; Flags still come from "cmp rdx,448" in $L$tail8x: equal means done.
        je      NEAR $L$done8x

        ; Partial block left: advance by 448 bytes, stage the last 64 bytes of
        ; keystream (ymm0, ymm4) at [rsp], zero the byte-loop index r10.
        ; No jmp is needed here: execution falls through into $L$oop_tail8x.
        lea     rsi,[448+rsi]
        xor     r10,r10
        vmovdqa YMMWORD[rsp],ymm0
        lea     rdi,[448+rdi]
        sub     rdx,448
        vmovdqa YMMWORD[32+rsp],ymm4

$L$oop_tail8x:
        ; Byte-at-a-time finish for the final partial block.
        ;   r10   = running byte index
        ;   rdx   = bytes still to produce
        ;   [rsp] = 64 bytes of staged keystream
        ;   rsi / rdi = source / destination
        movzx   eax,BYTE[rsi+r10]               ; source byte
        movzx   ecx,BYTE[rsp+r10]               ; keystream byte
        xor     eax,ecx
        mov     BYTE[rdi+r10],al                ; store result at the same index
        lea     r10,[r10+1]                     ; next index
        dec     rdx
        jnz     NEAR $L$oop_tail8x

$L$done8x:
        ; Common exit. Zero every YMM register, then reload xmm6-xmm15
        ; (callee-saved under the Win64 ABI) from the frame area that starts
        ; at rsp+656.
        vzeroall
        lea     r11,[704+rsp]                   ; 704 = 656+48
        movaps  xmm6,XMMWORD[r11-48]
        movaps  xmm7,XMMWORD[r11-32]
        movaps  xmm8,XMMWORD[r11-16]
        movaps  xmm9,XMMWORD[r11]
        movaps  xmm10,XMMWORD[r11+16]
        movaps  xmm11,XMMWORD[r11+32]
        movaps  xmm12,XMMWORD[r11+48]
        movaps  xmm13,XMMWORD[r11+64]
        movaps  xmm14,XMMWORD[r11+80]
        movaps  xmm15,XMMWORD[r11+96]
        ; Switch back to the stack pointer value parked at [640+rsp], then
        ; reload rsi and rdi from the two slots just above it.
        mov     rsp,QWORD[rsp+640]
        mov     rsi,QWORD[rsp+16]               ;WIN64 epilogue
        mov     rdi,QWORD[rsp+8]
        DB      0F3h,0C3h               ;repret
$L$SEH_end_ChaCha20_8x: