We currently use an old version of libgcrypt, which results in us having fewer ciphers and missing out on many other improvements.

Signed-off-by: Vladimir Serbinenko <phcoder@gmail.com>
Reviewed-by: Daniel Kiper <daniel.kiper@oracle.com>
/* chacha20-amd64-avx512.S - AVX512 implementation of ChaCha20 cipher
 *
 * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Based on D. J. Bernstein reference implementation at
 * http://cr.yp.to/chacha.html:
 *
 * chacha-regs.c version 20080118
 * D. J. Bernstein
 * Public domain.
 */

#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

#include "asm-common-amd64.h"

/* register macros */
#define INPUT %rdi
#define DST %rsi
#define SRC %rdx
#define NBLKS %rcx
#define ROUND %eax

/* vector registers */
#define X0 %zmm0
#define X1 %zmm1
#define X2 %zmm2
#define X3 %zmm3
#define X4 %zmm4
#define X5 %zmm5
#define X6 %zmm6
#define X7 %zmm7
#define X8 %zmm8
#define X9 %zmm9
#define X10 %zmm10
#define X11 %zmm11
#define X12 %zmm12
#define X13 %zmm13
#define X14 %zmm14
#define X15 %zmm15
#define X0y %ymm0
#define X1y %ymm1
#define X2y %ymm2
#define X3y %ymm3
#define X4y %ymm4
#define X5y %ymm5
#define X6y %ymm6
#define X7y %ymm7
#define X8y %ymm8
#define X9y %ymm9
#define X10y %ymm10
#define X11y %ymm11
#define X12y %ymm12
#define X13y %ymm13
#define X14y %ymm14
#define X15y %ymm15
#define X0x %xmm0
#define X1x %xmm1
#define X2x %xmm2
#define X3x %xmm3
#define X4x %xmm4
#define X5x %xmm5
#define X6x %xmm6
#define X7x %xmm7
#define X8x %xmm8
#define X9x %xmm9
#define X10x %xmm10
#define X11x %xmm11
#define X12x %xmm12
#define X13x %xmm13
#define X14x %xmm14
#define X15x %xmm15

#define TMP0 %zmm16
#define TMP1 %zmm17
#define TMP0y %ymm16
#define TMP1y %ymm17
#define TMP0x %xmm16
#define TMP1x %xmm17

#define COUNTER_ADD %zmm18
#define COUNTER_ADDy %ymm18
#define COUNTER_ADDx %xmm18

#define X12_SAVE %zmm19
#define X12_SAVEy %ymm19
#define X12_SAVEx %xmm19
#define X13_SAVE %zmm20
#define X13_SAVEy %ymm20
#define X13_SAVEx %xmm20

#define S0 %zmm21
#define S1 %zmm22
#define S2 %zmm23
#define S3 %zmm24
#define S4 %zmm25
#define S5 %zmm26
#define S6 %zmm27
#define S7 %zmm28
#define S8 %zmm29
#define S14 %zmm30
#define S15 %zmm31
#define S0y %ymm21
#define S1y %ymm22
#define S2y %ymm23
#define S3y %ymm24
#define S4y %ymm25
#define S5y %ymm26
#define S6y %ymm27
#define S7y %ymm28
#define S8y %ymm29
#define S14y %ymm30
#define S15y %ymm31
#define S0x %xmm21
#define S1x %xmm22
#define S2x %xmm23
#define S3x %xmm24
#define S4x %xmm25
#define S5x %xmm26
#define S6x %xmm27
#define S7x %xmm28
#define S8x %xmm29
#define S14x %xmm30
#define S15x %xmm31
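
/* Note on register usage: in the vectorized paths each X<n> register
 * holds ChaCha20 state word n for all blocks processed in parallel,
 * one block per 32-bit lane.  S0-S8, S14 and S15 cache the input words
 * that stay constant across blocks; words 9-11 are re-broadcast from
 * INPUT when needed, and the per-block counter words 12/13 are kept in
 * X12_SAVE/X13_SAVE for the final feed-forward addition. */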

/**********************************************************************
  helper macros
 **********************************************************************/

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;
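
/* The vpunpck{l,h}dq steps interleave 32-bit words of row pairs and the
 * vpunpck{l,h}qdq steps interleave the resulting 64-bit halves; on the
 * wider registers this transposes each 128-bit lane independently. */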

/* 4x4 128-bit matrix transpose */
#define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \
        vshufi32x4 $0xee, x1, x0, t2; \
        vshufi32x4 $0x44, x1, x0, x0; \
        \
        vshufi32x4 $0x44, x3, x2, t1; \
        vshufi32x4 $0xee, x3, x2, x2; \
        \
        vshufi32x4 $0xdd, t1, x0, x1; \
        vshufi32x4 $0x88, t1, x0, x0; \
        \
        vshufi32x4 $0xdd, x2, t2, x3; \
        vshufi32x4 $0x88, x2, t2, x2;
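
/* vshufi32x4 shuffles whole 128-bit lanes: $0x44/$0xee select the
 * low/high lane pairs and $0x88/$0xdd the even/odd lanes, giving the
 * same transpose pattern as above but at 16-byte granularity. */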

/* 2x2 128-bit matrix transpose */
#define transpose_16byte_2x2(x0,x1,t1) \
        vmovdqa32  x0, t1; \
        vshufi32x4 $0x0, x1, x0, x0; \
        vshufi32x4 $0x3, x1, t1, x1;

#define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \
        vpxord (offset + 0 * (add))(src), x0, x0; \
        vpxord (offset + 1 * (add))(src), x4, x4; \
        vpxord (offset + 2 * (add))(src), x8, x8; \
        vpxord (offset + 3 * (add))(src), x12, x12; \
        vmovdqu32 x0, (offset + 0 * (add))(dst); \
        vmovdqu32 x4, (offset + 1 * (add))(dst); \
        vmovdqu32 x8, (offset + 2 * (add))(dst); \
        vmovdqu32 x12, (offset + 3 * (add))(dst);
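
/* XOR four registers of keystream with plaintext loaded from src and
 * store the ciphertext to dst (the actual stream-cipher step); "add" is
 * the byte stride between the four accesses. */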

#define xor_src_dst(dst, src, offset, xreg) \
        vpxord offset(src), xreg, xreg; \
        vmovdqu32 xreg, offset(dst);

#define clear_vec4(v0,v1,v2,v3) \
        vpxord v0, v0, v0; \
        vpxord v1, v1, v1; \
        vpxord v2, v2, v2; \
        vpxord v3, v3, v3;

#define clear_zmm16_zmm31() \
        clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
        clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
        clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
        clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31);
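
/* vzeroall/vzeroupper only affect %zmm0-%zmm15, so %zmm16-%zmm31 must
 * be wiped explicitly before returning to avoid leaving key material in
 * registers; an EVEX write to a %ymm register zeroes the upper 256 bits
 * of the corresponding %zmm register, so clearing the %ymm halves
 * suffices. */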

/**********************************************************************
  16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20
 **********************************************************************/

#define ROTATE2(v1,v2,c) \
        vprold $(c), v1, v1; \
        vprold $(c), v2, v2;

#define XOR(ds,s) \
        vpxord s, ds, ds;

#define PLUS(ds,s) \
        vpaddd s, ds, ds;

#define QUARTERROUND2V(a1,b1,c1,d1,a2,b2,c2,d2) \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
        ROTATE2(d1, d2, 16); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
        ROTATE2(b1, b2, 12); \
        PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
        ROTATE2(d1, d2, 8); \
        PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
        ROTATE2(b1, b2, 7);
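
/* QUARTERROUND2V is the standard ChaCha20 quarter round (RFC 8439):
 *   a += b; d ^= a; d <<<= 16;
 *   c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<= 8;
 *   c += d; b ^= c; b <<<= 7;
 * executed on two independent register sets at once for
 * instruction-level parallelism; the AVX512 vprold rotate avoids the
 * shift/shift/or sequence older implementations need. */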

/**********************************************************************
  1-way/2-way (xmm) chacha20
 **********************************************************************/

#define ROTATE(v1,c) \
        vprold $(c), v1, v1;

#define WORD_SHUF(v1,shuf) \
        vpshufd $shuf, v1, v1;

#define QUARTERROUND1H(x0,x1,x2,x3,shuf_x1,shuf_x2,shuf_x3) \
        PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 16); \
        PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12); \
        PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 8); \
        PLUS(x2, x3); \
          WORD_SHUF(x3, shuf_x3); \
        XOR(x1, x2); \
          WORD_SHUF(x2, shuf_x2); \
        ROTATE(x1, 7); \
          WORD_SHUF(x1, shuf_x1);

#define QUARTERROUND2H(x0,x1,x2,x3,y0,y1,y2,y3,shuf_x1,shuf_x2,shuf_x3) \
        PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \
        ROTATE(x3, 16); ROTATE(y3, 16); \
        PLUS(x2, x3); PLUS(y2, y3); XOR(x1, x2); XOR(y1, y2); \
        ROTATE(x1, 12); ROTATE(y1, 12); \
        PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \
        ROTATE(x3, 8); ROTATE(y3, 8); \
        PLUS(x2, x3); PLUS(y2, y3); \
          WORD_SHUF(x3, shuf_x3); WORD_SHUF(y3, shuf_x3); \
        XOR(x1, x2); XOR(y1, y2); \
          WORD_SHUF(x2, shuf_x2); WORD_SHUF(y2, shuf_x2); \
        ROTATE(x1, 7); ROTATE(y1, 7); \
          WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1);
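
/* In the 1-way/2-way paths a whole 4x4 state matrix lives in four XMM
 * registers (one row each), so diagonal rounds are formed by rotating
 * rows 1-3 with vpshufd: the shuffle set 0x39/0x4e/0x93 moves the
 * diagonals into columns, and the mirrored set 0x93/0x4e/0x39 moves
 * them back after the next round. */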

SECTION_RODATA

.align 64
ELF(.type _gcry_chacha20_amd64_avx512_data,@object;)
_gcry_chacha20_amd64_avx512_data:
.Lcounter_0_1_2_3:
.Lcounter_0_1:
        .long 0,0,0,0
.Lone:
        .long 1,0,0,0
.Lcounter_2_3:
.Ltwo:
        .long 2,0,0,0
.Lthree:
        .long 3,0,0,0
.Linc_counter:
        .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data)
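
/* .Linc_counter is zero-extended to dwords with vpmovzxbd to form the
 * per-lane counter offsets 0..15.  .Lone/.Ltwo serve the tail paths as
 * quadword constants, so a vpaddq on state words 12..15 steps the
 * counter with any carry out of word 12 propagating into word 13. */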

.text

.align 16
.globl _gcry_chacha20_amd64_avx512_blocks
ELF(.type _gcry_chacha20_amd64_avx512_blocks,@function;)
_gcry_chacha20_amd64_avx512_blocks:
        /* input:
         *      %rdi: input
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: nblks
         */
        CFI_STARTPROC();

        spec_stop_avx512;

        cmpq $4, NBLKS;
        jb .Lskip_vertical_handling;
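
        /* NBLKS >= 4: consume blocks 16 at a time in ZMM registers,
         * then 8 in YMM and 4 in XMM; any remainder of 1-3 blocks is
         * finished by the horizontal XMM code at
         * .Lskip_vertical_handling. */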

        /* Load constants */
        vpmovzxbd .Linc_counter rRIP, COUNTER_ADD;
        kxnorq %k1, %k1, %k1;

        cmpq $16, NBLKS;
        jae .Lprocess_16v;

        /* Preload state to YMM registers */
        vpbroadcastd (0 * 4)(INPUT), S0y;
        vpbroadcastd (1 * 4)(INPUT), S1y;
        vpbroadcastd (2 * 4)(INPUT), S2y;
        vpbroadcastd (3 * 4)(INPUT), S3y;
        vpbroadcastd (4 * 4)(INPUT), S4y;
        vpbroadcastd (5 * 4)(INPUT), S5y;
        vpbroadcastd (6 * 4)(INPUT), S6y;
        vpbroadcastd (7 * 4)(INPUT), S7y;
        vpbroadcastd (8 * 4)(INPUT), S8y;
        vpbroadcastd (14 * 4)(INPUT), S14y;
        vpbroadcastd (15 * 4)(INPUT), S15y;
        jmp .Lskip16v;

.align 16
.Lprocess_16v:
        /* Process 16 ChaCha20 blocks */

        /* Preload state to ZMM registers */
        vpbroadcastd (0 * 4)(INPUT), S0;
        vpbroadcastd (1 * 4)(INPUT), S1;
        vpbroadcastd (2 * 4)(INPUT), S2;
        vpbroadcastd (3 * 4)(INPUT), S3;
        vpbroadcastd (4 * 4)(INPUT), S4;
        vpbroadcastd (5 * 4)(INPUT), S5;
        vpbroadcastd (6 * 4)(INPUT), S6;
        vpbroadcastd (7 * 4)(INPUT), S7;
        vpbroadcastd (8 * 4)(INPUT), S8;
        vpbroadcastd (14 * 4)(INPUT), S14;
        vpbroadcastd (15 * 4)(INPUT), S15;

        movl $20, ROUND;
        subq $16, NBLKS;

        /* Construct counter vectors X12 and X13 */
        vpmovm2d %k1, X9;
        vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12;
        vpbroadcastd (13 * 4)(INPUT), X13;
        vpcmpud $6, X12, COUNTER_ADD, %k2;
        vpsubd X9, X13, X13{%k2};
        vmovdqa32 X12, X12_SAVE;
        vmovdqa32 X13, X13_SAVE;
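
        /* Counter carry handling: %k1 is all-ones (kxnorq above), so
         * vpmovm2d sets every lane of X9 to -1.  vpcmpud with predicate
         * 6 (unsigned greater-than) marks the lanes where adding
         * COUNTER_ADD wrapped the 32-bit counter, and the masked vpsubd
         * of -1 increments word 13 there, emulating a 64-bit block
         * counter across words 12/13. */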

        /* Load vectors */
        vmovdqa32 S0, X0;
        vmovdqa32 S4, X4;
        vmovdqa32 S8, X8;
        vmovdqa32 S1, X1;
        vmovdqa32 S5, X5;
        vpbroadcastd (9 * 4)(INPUT), X9;
        QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13)
        vmovdqa32 S2, X2;
        vmovdqa32 S6, X6;
        vpbroadcastd (10 * 4)(INPUT), X10;
        vmovdqa32 S14, X14;
        vmovdqa32 S3, X3;
        vmovdqa32 S7, X7;
        vpbroadcastd (11 * 4)(INPUT), X11;
        vmovdqa32 S15, X15;

        /* Update counter */
        addq $16, (12 * 4)(INPUT);

        jmp .Lround2_entry_16v;
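
        /* The 20-round loop is rotated (software pipelined): the column
         * quarter rounds for columns 0/1 were already issued above,
         * interleaved with the state loads, so .Lround2_entry_16v starts
         * mid double-round and the final diagonal quarter rounds are
         * completed in the epilogue after the loop. */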

.align 16
.Loop16v:
        movl $20, ROUND;
        subq $16, NBLKS;

        vmovdqa32 S0, X0;
        vmovdqa32 S4, X4;
        vmovdqa32 S8, X8;
        transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 1), 256, X1, X5, X9, X13);
        vpmovm2d %k1, X9;
        vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12;
        vpbroadcastd (13 * 4)(INPUT), X13;
        vpcmpud $6, X12, COUNTER_ADD, %k2;
        vpsubd X9, X13, X13{%k2};
        vmovdqa32 S1, X1;
        vmovdqa32 S5, X5;
        vpbroadcastd (9 * 4)(INPUT), X9;
        vmovdqa32 X12, X12_SAVE;
        vmovdqa32 X13, X13_SAVE;
        QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13)
        transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 2), 256, X2, X6, X10, X14);
        vmovdqa32 S2, X2;
        vmovdqa32 S6, X6;
        vpbroadcastd (10 * 4)(INPUT), X10;
        vmovdqa32 S14, X14;
        transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 3), 256, X3, X7, X11, X15);
        leaq (16 * 64)(SRC), SRC;
        leaq (16 * 64)(DST), DST;
        vmovdqa32 S3, X3;
        vmovdqa32 S7, X7;
        vpbroadcastd (11 * 4)(INPUT), X11;
        vmovdqa32 S15, X15;

        /* Update counter */
        addq $16, (12 * 4)(INPUT);

        jmp .Lround2_entry_16v;

.align 16
.Lround2_16v:
        QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14)
        QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13)
.align 16
.Lround2_entry_16v:
        QUARTERROUND2V(X2, X6, X10, X14, X3, X7, X11, X15)
        QUARTERROUND2V(X0, X5, X10, X15, X1, X6, X11, X12)
        subl $2, ROUND;
        jnz .Lround2_16v;
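
        /* Loop epilogue: the remaining diagonal quarter rounds of the
         * last double round are still outstanding here and are
         * interleaved below with the feed-forward additions of the
         * saved input state. */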

        PLUS(X0, S0);
        PLUS(X1, S1);
        QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14)
        PLUS(X2, S2);
        PLUS(X3, S3);
        transpose_4x4(X0, X1, X2, X3, TMP0, TMP1);
        PLUS(X4, S4);
        PLUS(X5, S5);
        PLUS(X6, S6);
        PLUS(X7, S7);
        transpose_4x4(X4, X5, X6, X7, TMP0, TMP1);
        PLUS(X8, S8);
        PLUS(X9, (9 * 4)(INPUT){1to16});
        PLUS(X10, (10 * 4)(INPUT){1to16});
        PLUS(X11, (11 * 4)(INPUT){1to16});
        transpose_4x4(X8, X9, X10, X11, TMP0, TMP1);
        PLUS(X12, X12_SAVE);
        PLUS(X13, X13_SAVE);
        PLUS(X14, S14);
        PLUS(X15, S15);
        transpose_4x4(X12, X13, X14, X15, TMP0, TMP1);

        transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 0), 256, X0, X4, X8, X12);

        cmpq $16, NBLKS;
        jae .Loop16v;

        transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 1), 256, X1, X5, X9, X13);
        transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 2), 256, X2, X6, X10, X14);
        transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1);
        xor_src_dst_4x4(DST, SRC, (64 * 3), 256, X3, X7, X11, X15);

        leaq (16 * 64)(SRC), SRC;
        leaq (16 * 64)(DST), DST;

.align 16
.Lskip16v:
        cmpq $8, NBLKS;
        jb .Lskip8v;

        /* Process 8 ChaCha20 blocks */

        /* Construct counter vectors X12 and X13 */
        vpmovm2d %k1, X9y;
        vpaddd (12 * 4)(INPUT){1to8}, COUNTER_ADDy, X12y;
        vpbroadcastd (13 * 4)(INPUT), X13y;
        vpcmpud $6, X12y, COUNTER_ADDy, %k2;
        vpsubd X9y, X13y, X13y{%k2};
        vmovdqa32 X12y, X12_SAVEy;
        vmovdqa32 X13y, X13_SAVEy;

        /* Load vectors */
        vmovdqa32 S0y, X0y;
        vmovdqa32 S4y, X4y;
        vmovdqa32 S8y, X8y;
        vmovdqa32 S1y, X1y;
        vmovdqa32 S5y, X5y;
        vpbroadcastd (9 * 4)(INPUT), X9y;
        vmovdqa32 S2y, X2y;
        vmovdqa32 S6y, X6y;
        vpbroadcastd (10 * 4)(INPUT), X10y;
        vmovdqa32 S14y, X14y;
        vmovdqa32 S3y, X3y;
        vmovdqa32 S7y, X7y;
        vpbroadcastd (11 * 4)(INPUT), X11y;
        vmovdqa32 S15y, X15y;

        /* Update counter */
        addq $8, (12 * 4)(INPUT);

        movl $20, ROUND;
        subq $8, NBLKS;
.align 16
.Lround2_8v:
        QUARTERROUND2V(X0y, X4y, X8y, X12y, X1y, X5y, X9y, X13y)
        QUARTERROUND2V(X2y, X6y, X10y, X14y, X3y, X7y, X11y, X15y)
        QUARTERROUND2V(X0y, X5y, X10y, X15y, X1y, X6y, X11y, X12y)
        QUARTERROUND2V(X2y, X7y, X8y, X13y, X3y, X4y, X9y, X14y)
        subl $2, ROUND;
        jnz .Lround2_8v;

        PLUS(X0y, S0y);
        PLUS(X1y, S1y);
        PLUS(X2y, S2y);
        PLUS(X3y, S3y);
        transpose_4x4(X0y, X1y, X2y, X3y, TMP0y, TMP1y);
        PLUS(X4y, S4y);
        PLUS(X5y, S5y);
        PLUS(X6y, S6y);
        PLUS(X7y, S7y);
        transpose_4x4(X4y, X5y, X6y, X7y, TMP0y, TMP1y);
        PLUS(X8y, S8y);
        transpose_16byte_2x2(X0y, X4y, TMP0y);
        PLUS(X9y, (9 * 4)(INPUT){1to8});
        transpose_16byte_2x2(X1y, X5y, TMP0y);
        PLUS(X10y, (10 * 4)(INPUT){1to8});
        transpose_16byte_2x2(X2y, X6y, TMP0y);
        PLUS(X11y, (11 * 4)(INPUT){1to8});
        transpose_16byte_2x2(X3y, X7y, TMP0y);
        xor_src_dst_4x4(DST, SRC, (16 * 0), 64, X0y, X1y, X2y, X3y);
        transpose_4x4(X8y, X9y, X10y, X11y, TMP0y, TMP1y);
        PLUS(X12y, X12_SAVEy);
        PLUS(X13y, X13_SAVEy);
        PLUS(X14y, S14y);
        PLUS(X15y, S15y);
        xor_src_dst_4x4(DST, SRC, (16 * 16), 64, X4y, X5y, X6y, X7y);
        transpose_4x4(X12y, X13y, X14y, X15y, TMP0y, TMP1y);
        transpose_16byte_2x2(X8y, X12y, TMP0y);
        transpose_16byte_2x2(X9y, X13y, TMP0y);
        transpose_16byte_2x2(X10y, X14y, TMP0y);
        transpose_16byte_2x2(X11y, X15y, TMP0y);
        xor_src_dst_4x4(DST, SRC, (16 * 2), 64, X8y, X9y, X10y, X11y);
        xor_src_dst_4x4(DST, SRC, (16 * 18), 64, X12y, X13y, X14y, X15y);

        leaq (8 * 64)(SRC), SRC;
        leaq (8 * 64)(DST), DST;

.align 16
.Lskip8v:
        cmpq $4, NBLKS;
        jb .Lskip4v;

        /* Process 4 ChaCha20 blocks */

        /* Construct counter vectors X12 and X13 */
        vpmovm2d %k1, X9x;
        vpaddd (12 * 4)(INPUT){1to4}, COUNTER_ADDx, X12x;
        vpbroadcastd (13 * 4)(INPUT), X13x;
        vpcmpud $6, X12x, COUNTER_ADDx, %k2;
        vpsubd X9x, X13x, X13x{%k2};
        vmovdqa32 X12x, X12_SAVEx;
        vmovdqa32 X13x, X13_SAVEx;

        /* Load vectors */
        vmovdqa32 S0x, X0x;
        vmovdqa32 S4x, X4x;
        vmovdqa32 S8x, X8x;
        vmovdqa32 S1x, X1x;
        vmovdqa32 S5x, X5x;
        vpbroadcastd (9 * 4)(INPUT), X9x;
        vmovdqa32 S2x, X2x;
        vmovdqa32 S6x, X6x;
        vpbroadcastd (10 * 4)(INPUT), X10x;
        vmovdqa32 S14x, X14x;
        vmovdqa32 S3x, X3x;
        vmovdqa32 S7x, X7x;
        vpbroadcastd (11 * 4)(INPUT), X11x;
        vmovdqa32 S15x, X15x;

        /* Update counter */
        addq $4, (12 * 4)(INPUT);

        movl $20, ROUND;
        subq $4, NBLKS;
.align 16
.Lround2_4v:
        QUARTERROUND2V(X0x, X4x, X8x, X12x, X1x, X5x, X9x, X13x)
        QUARTERROUND2V(X2x, X6x, X10x, X14x, X3x, X7x, X11x, X15x)
        QUARTERROUND2V(X0x, X5x, X10x, X15x, X1x, X6x, X11x, X12x)
        QUARTERROUND2V(X2x, X7x, X8x, X13x, X3x, X4x, X9x, X14x)
        subl $2, ROUND;
        jnz .Lround2_4v;

        PLUS(X0x, S0x);
        PLUS(X1x, S1x);
        PLUS(X2x, S2x);
        PLUS(X3x, S3x);
        transpose_4x4(X0x, X1x, X2x, X3x, TMP0x, TMP1x);
        PLUS(X4x, S4x);
        PLUS(X5x, S5x);
        PLUS(X6x, S6x);
        PLUS(X7x, S7x);
        xor_src_dst_4x4(DST, SRC, (16 * 0), 64, X0x, X1x, X2x, X3x);
        transpose_4x4(X4x, X5x, X6x, X7x, TMP0x, TMP1x);
        PLUS(X8x, S8x);
        PLUS(X9x, (9 * 4)(INPUT){1to4});
        PLUS(X10x, (10 * 4)(INPUT){1to4});
        PLUS(X11x, (11 * 4)(INPUT){1to4});
        xor_src_dst_4x4(DST, SRC, (16 * 1), 64, X4x, X5x, X6x, X7x);
        transpose_4x4(X8x, X9x, X10x, X11x, TMP0x, TMP1x);
        PLUS(X12x, X12_SAVEx);
        PLUS(X13x, X13_SAVEx);
        PLUS(X14x, S14x);
        PLUS(X15x, S15x);
        xor_src_dst_4x4(DST, SRC, (16 * 2), 64, X8x, X9x, X10x, X11x);
        transpose_4x4(X12x, X13x, X14x, X15x, TMP0x, TMP1x);
        xor_src_dst_4x4(DST, SRC, (16 * 3), 64, X12x, X13x, X14x, X15x);

        leaq (4 * 64)(SRC), SRC;
        leaq (4 * 64)(DST), DST;

.align 16
.Lskip4v:
        /* clear AVX512 registers */
        kxorq %k2, %k2, %k2;
        vzeroupper;
        clear_zmm16_zmm31();
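
        /* Key-derived data in %zmm16-%zmm31 and the %k2 mask is wiped
         * here, since fewer than 4 blocks remain; %zmm0-%zmm15 are
         * cleared by the vzeroall at .Ldone below. */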

.align 16
.Lskip_vertical_handling:
        cmpq $0, NBLKS;
        je .Ldone;

        /* Load state */
        vmovdqu (0 * 4)(INPUT), X10x;
        vmovdqu (4 * 4)(INPUT), X11x;
        vmovdqu (8 * 4)(INPUT), X12x;
        vmovdqu (12 * 4)(INPUT), X13x;

        /* Load constant */
        vmovdqa .Lone rRIP, X4x;

        cmpq $1, NBLKS;
        je .Lhandle1;

        /* Process two ChaCha20 blocks (XMM) */
        movl $20, ROUND;
        subq $2, NBLKS;

        vmovdqa X10x, X0x;
        vmovdqa X11x, X1x;
        vmovdqa X12x, X2x;
        vmovdqa X13x, X3x;

        vmovdqa X10x, X8x;
        vmovdqa X11x, X9x;
        vmovdqa X12x, X14x;
        vpaddq X4x, X13x, X15x;
        vmovdqa X15x, X7x;

.align 16
.Lround2_2:
        QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x,
                       0x39, 0x4e, 0x93);
        QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x,
                       0x93, 0x4e, 0x39);
        subl $2, ROUND;
        jnz .Lround2_2;

        PLUS(X0x, X10x);
        PLUS(X1x, X11x);
        PLUS(X2x, X12x);
        PLUS(X3x, X13x);

        vpaddq .Ltwo rRIP, X13x, X13x; /* Update counter */
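
        /* X13x holds state words 12..15; the quadword add of .Ltwo
         * advances the block counter by 2, letting a carry out of
         * word 12 propagate into word 13. */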

        xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x);

        PLUS(X8x, X10x);
        PLUS(X9x, X11x);
        PLUS(X14x, X12x);
        PLUS(X15x, X7x);

        xor_src_dst_4x4(DST, SRC, 16 * 4, 4 * 4, X8x, X9x, X14x, X15x);
        lea (2 * 64)(DST), DST;
        lea (2 * 64)(SRC), SRC;

        cmpq $0, NBLKS;
        je .Lskip1;

.align 16
.Lhandle1:
        /* Process one ChaCha20 block (XMM) */
        movl $20, ROUND;
        subq $1, NBLKS;

        vmovdqa X10x, X0x;
        vmovdqa X11x, X1x;
        vmovdqa X12x, X2x;
        vmovdqa X13x, X3x;

.align 16
.Lround2_1:
        QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x39, 0x4e, 0x93);
        QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x93, 0x4e, 0x39);
        subl $2, ROUND;
        jnz .Lround2_1;

        PLUS(X0x, X10x);
        PLUS(X1x, X11x);
        PLUS(X2x, X12x);
        PLUS(X3x, X13x);

        vpaddq X4x, X13x, X13x; /* Update counter */

        xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x);

.align 16
.Lskip1:
        /* Store counter */
        vmovdqu X13x, (12 * 4)(INPUT);

.align 16
.Ldone:
        vzeroall; /* clears ZMM0-ZMM15 */

        xorl %eax, %eax;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_chacha20_amd64_avx512_blocks,
          .-_gcry_chacha20_amd64_avx512_blocks;)

#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
#endif /*__x86_64*/