We currently use an old version of libgcrypt, which leaves us with fewer ciphers and missing out on many other improvements.

Signed-off-by: Vladimir Serbinenko <phcoder@gmail.com>
Reviewed-by: Daniel Kiper <daniel.kiper@oracle.com>
/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak
 *
 * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * ---
 *
 * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
 * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
 * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
 * preprocessed assembly and fitted with new absorb function optimized for
 * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
 *
 * Original copyright license follows:
 *
 *  Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  * Redistributions of source code must retain copyright notices,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 *  * Neither the name of the CRYPTOGAMS nor the names of its
 *    copyright holder and contributors may be used to endorse or
 *    promote products derived from this software without specific
 *    prior written permission.
 *
 *  ALTERNATIVELY, provided that this notice is retained in full, this
 *  product may be distributed under the terms of the GNU General Public
 *  License (GPL), in which case the provisions of the GPL apply INSTEAD OF
 *  those given above.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

#include "asm-common-amd64.h"

.text

/* Register macros. */
#define A_0_0 %xmm31
#define A_0_1 %xmm30
#define A_0_2 %xmm29
#define A_0_3 %xmm28
#define A_0_4 %xmm27
#define A_1_0 %xmm26
#define A_1_1 %xmm25
#define A_1_2 %xmm24
#define A_1_3 %xmm23
#define A_1_4 %xmm22
#define A_2_0 %xmm21
#define A_2_1 %xmm20
#define A_2_2 %xmm19
#define A_2_3 %xmm18
#define A_2_4 %xmm17
#define A_3_0 %xmm16
#define A_3_1 %xmm15
#define A_3_2 %xmm14
#define A_3_3 %xmm13
#define A_3_4 %xmm12
#define A_4_0 %xmm11
#define A_4_1 %xmm10
#define A_4_2 %xmm9
#define A_4_3 %xmm8
#define A_4_4 %xmm7

#define C_0 %xmm6
#define C_1 %xmm5
#define C_2 %xmm4
#define C_3 %xmm3
#define C_4 %xmm2
#define C_5 %xmm1
#define C_6 %xmm0

#define D_0 C_4
#define D_1 C_5
#define D_2 C_6
#define D_3 C_2
#define D_4 C_3

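/* Layout note: each A_x_y register carries Keccak state lane A[x][y] in its
 * low 64 bits.  The state loads below read 128 bits at a time, but only the
 * low qword of every register is repacked (vpunpcklqdq) and stored back, so
 * the high halves are effectively scratch. */
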
/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */
#define eor3_d(dst_s1, s2, s3) \
	vpternlogq $0x96, s3, s2, dst_s1;

#define eor3(dst, s1, s2, s3) \
	vmovdqa s1, dst; \
	eor3_d(dst, s2, s3);

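/* Note: vpternlogq with immediate 0x96 evaluates the three-input truth table
 * A ^ B ^ C, so eor3/eor3_d provide the ARMv8.2-SHA3 EOR3 (three-way XOR)
 * operation in a single AVX512 instruction. */
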
#define rax1_c(dst, s1, s2_rol1) \
	vprolq $1, s2_rol1, dst; \
	vpxor s1, dst, dst;

#define rax1_t(dst_s1, s2_rol1, tmp) \
	vprolq $1, s2_rol1, tmp; \
	vpxor tmp, dst_s1, dst_s1;

#define rax1_s(dst_s1, s2_rol1) \
	vprolq $1, s2_rol1, s2_rol1; \
	vpxor s2_rol1, dst_s1, dst_s1;

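/* The rax1 variants compute dst = s1 ^ rol64(s2, 1), i.e. the D[] words of
 * the Theta step (ARMv8 RAX1): _c writes a fresh destination, _t goes
 * through a temporary register, and _s clobbers s2 in place. */
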
#define xar(dst, s1, s2, rol) \
	vpxorq s2, s1, dst; \
	vprolq $(rol), dst, dst;

#define xar_x(dst, s1, s2, rol) \
	vpxor s2, s1, dst; \
	vprolq $(rol), dst, dst;

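/* xar computes dst = rol64(s1 ^ s2, rol): the Theta XOR with a D[] word fused
 * with the per-lane Rho rotation (ARMv8 XAR).  The _x variant uses the
 * shorter VEX-encoded vpxor, which can only address %xmm0-%xmm15. */
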
#define bcax_d(dst_s1, s2, s3) \
	vpternlogq $0xb4, s3, s2, dst_s1;

#define bcax(dst, s1, s2, s3) \
	vmovdqa64 s1, dst; \
	bcax_d(dst, s2, s3);

#define bcax_x(dst, s1, s2, s3) \
	vmovdqa s1, dst; \
	bcax_d(dst, s2, s3);

#define eor(dst, s1, s2) \
	vpxorq s2, s1, dst;

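/* Note: vpternlogq with immediate 0xb4 evaluates s1 ^ (s2 & ~s3), the same
 * boolean function as the ARMv8.2-SHA3 BCAX (bit-clear and XOR) instruction;
 * this implements the nonlinear Chi step one lane triple at a time. */
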
/* Misc helper macros. */
#define clear_avx512_4regs(a, b, c, d) \
	eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d);

#define clear_regs() \
	vzeroall; /* xmm0-xmm15 */ \
	clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \
	clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \
	clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \
	clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31);

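/* Note: vzeroall only touches registers 0-15; the AVX512-only registers
 * %xmm16-%xmm31 have to be cleared with explicit XORs, presumably so that no
 * state material is left behind in vector registers on return. */
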
ELF(.type KeccakF1600_ce,@function)
.align 64, 0xcc
KeccakF1600_ce:
.Loop_ce:
	CFI_STARTPROC()

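	/* One iteration of this loop is one round of Keccak-f[1600]: %r10
	 * points at the current 64-bit round constant and %r11 at the last
	 * one, so the caller-supplied table (24 entries for the full
	 * permutation) determines the round count. */
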
	////////////////////////////////////////////////// Theta
	eor3( C_0, A_4_0, A_3_0, A_2_0)
	eor3( C_1, A_4_1, A_3_1, A_2_1)
	eor3( C_3, A_4_3, A_3_3, A_2_3)
	eor3( C_2, A_4_2, A_3_2, A_2_2)
	eor3( C_4, A_4_4, A_3_4, A_2_4)
	eor3_d( C_0, A_1_0, A_0_0)
	eor3_d( C_1, A_1_1, A_0_1)
	eor3_d( C_3, A_1_3, A_0_3)
	eor3_d( C_2, A_1_2, A_0_2)
	eor3_d( C_4, A_1_4, A_0_4)

	rax1_c( C_5, C_0, C_2)			// D[1]
	rax1_t( C_2, C_4, C_6)			// D[3]
	rax1_c( C_6, C_1, C_3)			// D[2]
	rax1_s( C_3, C_0)			// D[4]
	rax1_s( C_4, C_1)			// D[0]

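	/* The D[x] = C[x-1] ^ rol64(C[x+1], 1) values now sit in the C_*
	 * registers named by the D_* aliases above (see the D[n] notes). */
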
	////////////////////////////////////////////////// Theta+Rho+Pi
	xar( C_0, A_0_1, D_1, 1)		// C[0]=A[2][0]

	xar( A_0_1, A_1_1, D_1, 44)
	xar( A_1_1, A_1_4, D_4, 20)
	xar( A_1_4, A_4_2, D_2, 61)
	xar( A_4_2, A_2_4, D_4, 39)
	xar( A_2_4, A_4_0, D_0, 18)

	xar( C_1, A_0_2, D_2, 62)		// C[1]=A[4][0]

	xar( A_0_2, A_2_2, D_2, 43)
	xar( A_2_2, A_2_3, D_3, 25)
	xar( A_2_3, A_3_4, D_4, 8)
	xar_x( A_3_4, A_4_3, D_3, 56)
	xar( A_4_3, A_3_0, D_0, 41)

	xar( A_3_0, A_0_4, D_4, 27)

	xar_x( D_4, A_4_4, D_4, 14)		// D[4]=A[0][4]
	xar_x( A_4_4, A_4_1, D_1, 2)
	xar( A_1_3, A_1_3, D_3, 55)		// A[1][3]=A[4][1]
	xar( A_0_4, A_3_1, D_1, 45)		// A[0][4]=A[1][3]
	xar( A_3_1, A_1_0, D_0, 36)

	xar( A_1_0, A_0_3, D_3, 28)

	eor( A_0_0, A_0_0, D_0)

	xar_x( D_3, A_3_3, D_3, 21)		// D[3]=A[0][3]
	xar( A_0_3, A_3_2, D_2, 15)		// A[0][3]=A[3][3]
	xar( D_1, A_2_1, D_1, 10)		// D[1]=A[3][2]
	xar( D_2, A_1_2, D_2, 6)		// D[2]=A[2][1]
	xar( D_0, A_2_0, D_0, 3)		// D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	bcax_x( A_4_0, C_1, A_4_2, A_1_3)	// A[1][3]=A[4][1]
	bcax( A_4_1, A_1_3, A_4_3, A_4_2)	// A[1][3]=A[4][1]
	bcax_d( A_4_2, A_4_4, A_4_3)
	bcax_d( A_4_3, C_1, A_4_4)
	bcax_d( A_4_4, A_1_3, C_1)		// A[1][3]=A[4][1]

	bcax_x( A_3_2, D_1, A_3_4, A_0_3)	// A[0][3]=A[3][3]
	bcax( A_3_3, A_0_3, A_3_0, A_3_4)	// A[0][3]=A[3][3]
	bcax_d( A_3_4, A_3_1, A_3_0)
	bcax_d( A_3_0, D_1, A_3_1)
	bcax_d( A_3_1, A_0_3, D_1)		// A[0][3]=A[3][3]

	bcax( A_2_0, C_0, A_2_2, D_2)
	bcax( A_2_1, D_2, A_2_3, A_2_2)
	bcax_d( A_2_2, A_2_4, A_2_3)
	bcax_d( A_2_3, C_0, A_2_4)
	bcax_d( A_2_4, D_2, C_0)

	bcax( A_1_2, D_0, A_1_4, A_0_4)		// A[0][4]=A[1][3]
	bcax( A_1_3, A_0_4, A_1_0, A_1_4)	// A[0][4]=A[1][3]
	bcax_d( A_1_4, A_1_1, A_1_0)
	bcax_d( A_1_0, D_0, A_1_1)
	bcax_d( A_1_1, A_0_4, D_0)		// A[0][4]=A[1][3]

	bcax( A_0_3, D_3, A_0_0, D_4)
	bcax( A_0_4, D_4, A_0_1, A_0_0)
	bcax_d( A_0_0, A_0_2, A_0_1)
	bcax_d( A_0_1, D_3, A_0_2)
	bcax_d( A_0_2, D_4, D_3)
	eor( A_0_0, A_0_0, (%r10))

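	/* Iota: the round constant at (%r10) has just been XORed into lane
	 * A[0][0]; advance to the next constant unless this was the last
	 * round. */
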
	cmpq %r10, %r11
	je .Lend_ce

	addq $8, %r10
	jmp .Loop_ce

.align 64, 0xcc
.Lend_ce:
	ret_spec_stop
	CFI_ENDPROC()
ELF(.size KeccakF1600_ce,.-KeccakF1600_ce)

.globl _gcry_keccak_f1600_state_permute64_avx512
ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function)
.align 64, 0xcc
_gcry_keccak_f1600_state_permute64_avx512:
	/* input:
	 *	%rdi: state
	 *	%rsi: round constants
	 */
	CFI_STARTPROC()

	spec_stop_avx512;

	leaq 12*8(%rdi), %rax
	leaq (24-1)*8(%rsi), %r11

	vmovdqu64 0*8(%rdi), A_0_0
	vmovdqu64 1*8(%rdi), A_0_1
	vmovdqu64 2*8(%rdi), A_0_2
	vmovdqu64 3*8(%rdi), A_0_3
	vmovdqu64 4*8(%rdi), A_0_4
	vmovdqu64 5*8(%rdi), A_1_0
	vmovdqu64 6*8(%rdi), A_1_1
	vmovdqu64 7*8(%rdi), A_1_2
	vmovdqu64 8*8(%rdi), A_1_3
	vmovdqu64 9*8(%rdi), A_1_4
	vmovdqu64 10*8(%rdi), A_2_0
	vmovdqu64 11*8(%rdi), A_2_1
	vmovdqu64 0*8(%rax), A_2_2
	vmovdqu64 1*8(%rax), A_2_3
	vmovdqu64 2*8(%rax), A_2_4
	vmovdqu64 3*8(%rax), A_3_0
	vmovdqu 4*8(%rax), A_3_1
	vmovdqu 5*8(%rax), A_3_2
	vmovdqu 6*8(%rax), A_3_3
	vmovdqu 7*8(%rax), A_3_4
	vmovdqu 8*8(%rax), A_4_0
	vmovdqu 9*8(%rax), A_4_1
	vmovdqu 10*8(%rax), A_4_2
	vmovdqu 11*8(%rax), A_4_3
	vmovq 12*8(%rax), A_4_4

	movq %rsi, %r10
	call KeccakF1600_ce

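	/* Each A register now holds its lane in the low qword; vpunpcklqdq
	 * merges lane pairs so the state can be written back with 16-byte
	 * stores (A_4_4, the lone odd lane, goes out via vmovq). */
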
	vpunpcklqdq A_0_1, A_0_0, A_0_0
	vpunpcklqdq A_0_3, A_0_2, A_0_2
	vpunpcklqdq A_1_0, A_0_4, A_0_4
	vpunpcklqdq A_1_2, A_1_1, A_1_1
	vpunpcklqdq A_1_4, A_1_3, A_1_3
	vpunpcklqdq A_2_1, A_2_0, A_2_0
	vpunpcklqdq A_2_3, A_2_2, A_2_2
	vpunpcklqdq A_3_0, A_2_4, A_2_4
	vpunpcklqdq A_3_2, A_3_1, A_3_1
	vpunpcklqdq A_3_4, A_3_3, A_3_3
	vpunpcklqdq A_4_1, A_4_0, A_4_0
	vpunpcklqdq A_4_3, A_4_2, A_4_2
	vmovdqu64 A_0_0, 0*8(%rdi)
	vmovdqu64 A_0_2, 2*8(%rdi)
	vmovdqu64 A_0_4, 4*8(%rdi)
	vmovdqu64 A_1_1, 6*8(%rdi)
	vmovdqu64 A_1_3, 8*8(%rdi)
	vmovdqu64 A_2_0, 10*8(%rdi)
	vmovdqu64 A_2_2, 0*8(%rax)
	vmovdqu64 A_2_4, 2*8(%rax)
	vmovdqu A_3_1, 4*8(%rax)
	vmovdqu A_3_3, 6*8(%rax)
	vmovdqu A_4_0, 8*8(%rax)
	vmovdqu A_4_2, 10*8(%rax)
	vmovq A_4_4, 12*8(%rax)

	xorl %eax, %eax

	clear_regs()
	ret_spec_stop
	CFI_ENDPROC()
ELF(.size _gcry_keccak_f1600_state_permute64_avx512,
    .-_gcry_keccak_f1600_state_permute64_avx512)

.globl _gcry_keccak_absorb_blocks_avx512
ELF(.type _gcry_keccak_absorb_blocks_avx512,@function)
.align 64, 0xcc
_gcry_keccak_absorb_blocks_avx512:
	/* input:
	 *	%rdi: state
	 *	%rsi: round constants
	 *	%rdx: lanes
	 *	%rcx: nlanes
	 *	%r8 : blocklanes
	 *	%r9 : lanes output pointer
	 */
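	/* output (see .Labsorbed_ce below):
	 *	%rax: number of lanes left unabsorbed (less than one block)
	 *	(%r9): advanced lanes pointer
	 */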
	CFI_STARTPROC()

	spec_stop_avx512;

	leaq 12*8(%rdi), %rax
	leaq (24-1)*8(%rsi), %r11

	vmovdqu64 0*8(%rdi), A_0_0
	vmovdqu64 1*8(%rdi), A_0_1
	vmovdqu64 2*8(%rdi), A_0_2
	vmovdqu64 3*8(%rdi), A_0_3
	vmovdqu64 4*8(%rdi), A_0_4
	vmovdqu64 5*8(%rdi), A_1_0
	vmovdqu64 6*8(%rdi), A_1_1
	vmovdqu64 7*8(%rdi), A_1_2
	vmovdqu64 8*8(%rdi), A_1_3
	vmovdqu64 9*8(%rdi), A_1_4
	vmovdqu64 10*8(%rdi), A_2_0
	vmovdqu64 11*8(%rdi), A_2_1
	vmovdqu64 0*8(%rax), A_2_2
	vmovdqu64 1*8(%rax), A_2_3
	vmovdqu64 2*8(%rax), A_2_4
	vmovdqu64 3*8(%rax), A_3_0
	vmovdqu 4*8(%rax), A_3_1
	vmovdqu 5*8(%rax), A_3_2
	vmovdqu 6*8(%rax), A_3_3
	vmovdqu 7*8(%rax), A_3_4
	vmovdqu 8*8(%rax), A_4_0
	vmovdqu 9*8(%rax), A_4_1
	vmovdqu 10*8(%rax), A_4_2
	vmovdqu 11*8(%rax), A_4_3
	vmovq 12*8(%rax), A_4_4

	cmpq $(104 >> 3), %r8
	jb .Loop_absorb_72_ce
	je .Loop_absorb_104_ce
	cmpq $(144 >> 3), %r8
	jb .Loop_absorb_136_ce
	je .Loop_absorb_144_ce
	jmp .Loop_absorb_168_ce

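	/* The unrolled absorb loops below correspond to block sizes of 72,
	 * 104, 136, 144 and 168 bytes (9, 13, 17, 18 and 21 lanes), i.e. the
	 * SHA3-512, SHA3-384, SHA3-256/SHAKE256, SHA3-224 and SHAKE128
	 * rates. */
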
.align 64, 0xcc
.Loop_absorb_168_ce:
	subq %r8, %rcx		// len - bsz
	jb .Labsorbed_ce

	vpxorq 0*8(%rdx), A_0_0, A_0_0
	vpxorq 1*8(%rdx), A_0_1, A_0_1
	vpxorq 2*8(%rdx), A_0_2, A_0_2
	vpxorq 3*8(%rdx), A_0_3, A_0_3
	vpxorq 4*8(%rdx), A_0_4, A_0_4
	vpxorq 5*8(%rdx), A_1_0, A_1_0
	vpxorq 6*8(%rdx), A_1_1, A_1_1
	vpxorq 7*8(%rdx), A_1_2, A_1_2
	vpxorq 8*8(%rdx), A_1_3, A_1_3
	vpxorq 9*8(%rdx), A_1_4, A_1_4
	vpxorq 10*8(%rdx), A_2_0, A_2_0
	vpxorq 11*8(%rdx), A_2_1, A_2_1
	vpxorq 12*8(%rdx), A_2_2, A_2_2
	vpxorq 13*8(%rdx), A_2_3, A_2_3
	vpxorq 14*8(%rdx), A_2_4, A_2_4
	vpxorq 15*8(%rdx), A_3_0, A_3_0
	vpxor 16*8(%rdx), A_3_1, A_3_1
	vpxor 17*8(%rdx), A_3_2, A_3_2
	vpxor 18*8(%rdx), A_3_3, A_3_3
	vpxor 19*8(%rdx), A_3_4, A_3_4
	vmovq 20*8(%rdx), C_0
	leaq 21*8(%rdx), %rdx
	vpxorq C_0, A_4_0, A_4_0

	movq %rsi, %r10
	call KeccakF1600_ce

	jmp .Loop_absorb_168_ce

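	/* Each absorb loop works the same way: while at least a full block of
	 * input lanes remains, XOR the block into the first rate/8 state
	 * lanes, advance %rdx, and run the permutation.  The block's last
	 * lane is loaded with a 64-bit vmovq so that nothing is read past the
	 * end of the input block. */
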
.align 64, 0xcc
.Loop_absorb_144_ce:
	subq %r8, %rcx		// len - bsz
	jb .Labsorbed_ce

	vpxorq 0*8(%rdx), A_0_0, A_0_0
	vpxorq 1*8(%rdx), A_0_1, A_0_1
	vpxorq 2*8(%rdx), A_0_2, A_0_2
	vpxorq 3*8(%rdx), A_0_3, A_0_3
	vpxorq 4*8(%rdx), A_0_4, A_0_4
	vpxorq 5*8(%rdx), A_1_0, A_1_0
	vpxorq 6*8(%rdx), A_1_1, A_1_1
	vpxorq 7*8(%rdx), A_1_2, A_1_2
	vpxorq 8*8(%rdx), A_1_3, A_1_3
	vpxorq 9*8(%rdx), A_1_4, A_1_4
	vpxorq 10*8(%rdx), A_2_0, A_2_0
	vpxorq 11*8(%rdx), A_2_1, A_2_1
	vpxorq 12*8(%rdx), A_2_2, A_2_2
	vpxorq 13*8(%rdx), A_2_3, A_2_3
	vpxorq 14*8(%rdx), A_2_4, A_2_4
	vpxorq 15*8(%rdx), A_3_0, A_3_0
	vpxor 16*8(%rdx), A_3_1, A_3_1
	vmovq 17*8(%rdx), C_0
	leaq 18*8(%rdx), %rdx
	vpxor C_0, A_3_2, A_3_2

	movq %rsi, %r10
	call KeccakF1600_ce

	jmp .Loop_absorb_144_ce

.align 64, 0xcc
.Loop_absorb_136_ce:
	subq %r8, %rcx		// len - bsz
	jb .Labsorbed_ce

	vpxorq 0*8(%rdx), A_0_0, A_0_0
	vpxorq 1*8(%rdx), A_0_1, A_0_1
	vpxorq 2*8(%rdx), A_0_2, A_0_2
	vpxorq 3*8(%rdx), A_0_3, A_0_3
	vpxorq 4*8(%rdx), A_0_4, A_0_4
	vpxorq 5*8(%rdx), A_1_0, A_1_0
	vpxorq 6*8(%rdx), A_1_1, A_1_1
	vpxorq 7*8(%rdx), A_1_2, A_1_2
	vpxorq 8*8(%rdx), A_1_3, A_1_3
	vpxorq 9*8(%rdx), A_1_4, A_1_4
	vpxorq 10*8(%rdx), A_2_0, A_2_0
	vpxorq 11*8(%rdx), A_2_1, A_2_1
	vpxorq 12*8(%rdx), A_2_2, A_2_2
	vpxorq 13*8(%rdx), A_2_3, A_2_3
	vpxorq 14*8(%rdx), A_2_4, A_2_4
	vpxorq 15*8(%rdx), A_3_0, A_3_0
	vmovq 16*8(%rdx), C_0
	leaq 17*8(%rdx), %rdx
	vpxor C_0, A_3_1, A_3_1

	movq %rsi, %r10
	call KeccakF1600_ce

	jmp .Loop_absorb_136_ce

.align 64, 0xcc
.Loop_absorb_104_ce:
	subq %r8, %rcx		// len - bsz
	jb .Labsorbed_ce

	vpxorq 0*8(%rdx), A_0_0, A_0_0
	vpxorq 1*8(%rdx), A_0_1, A_0_1
	vpxorq 2*8(%rdx), A_0_2, A_0_2
	vpxorq 3*8(%rdx), A_0_3, A_0_3
	vpxorq 4*8(%rdx), A_0_4, A_0_4
	vpxorq 5*8(%rdx), A_1_0, A_1_0
	vpxorq 6*8(%rdx), A_1_1, A_1_1
	vpxorq 7*8(%rdx), A_1_2, A_1_2
	vpxorq 8*8(%rdx), A_1_3, A_1_3
	vpxorq 9*8(%rdx), A_1_4, A_1_4
	vpxorq 10*8(%rdx), A_2_0, A_2_0
	vpxorq 11*8(%rdx), A_2_1, A_2_1
	vmovq 12*8(%rdx), C_0
	leaq 13*8(%rdx), %rdx
	vpxorq C_0, A_2_2, A_2_2

	movq %rsi, %r10
	call KeccakF1600_ce

	jmp .Loop_absorb_104_ce

.align 64, 0xcc
.Loop_absorb_72_ce:
	subq %r8, %rcx		// len - bsz
	jb .Labsorbed_ce

	vpxorq 0*8(%rdx), A_0_0, A_0_0
	vpxorq 1*8(%rdx), A_0_1, A_0_1
	vpxorq 2*8(%rdx), A_0_2, A_0_2
	vpxorq 3*8(%rdx), A_0_3, A_0_3
	vpxorq 4*8(%rdx), A_0_4, A_0_4
	vpxorq 5*8(%rdx), A_1_0, A_1_0
	vpxorq 6*8(%rdx), A_1_1, A_1_1
	vpxorq 7*8(%rdx), A_1_2, A_1_2
	vmovq 8*8(%rdx), C_0
	leaq 9*8(%rdx), %rdx
	vpxorq C_0, A_1_3, A_1_3

	movq %rsi, %r10
	call KeccakF1600_ce

	jmp .Loop_absorb_72_ce

.align 64, 0xcc
.Labsorbed_ce:
	vpunpcklqdq A_0_1, A_0_0, A_0_0
	vpunpcklqdq A_0_3, A_0_2, A_0_2
	vpunpcklqdq A_1_0, A_0_4, A_0_4
	vpunpcklqdq A_1_2, A_1_1, A_1_1
	vpunpcklqdq A_1_4, A_1_3, A_1_3
	vpunpcklqdq A_2_1, A_2_0, A_2_0
	vpunpcklqdq A_2_3, A_2_2, A_2_2
	vpunpcklqdq A_3_0, A_2_4, A_2_4
	vpunpcklqdq A_3_2, A_3_1, A_3_1
	vpunpcklqdq A_3_4, A_3_3, A_3_3
	vpunpcklqdq A_4_1, A_4_0, A_4_0
	vpunpcklqdq A_4_3, A_4_2, A_4_2
	vmovdqu64 A_0_0, 0*8(%rdi)
	vmovdqu64 A_0_2, 2*8(%rdi)
	vmovdqu64 A_0_4, 4*8(%rdi)
	vmovdqu64 A_1_1, 6*8(%rdi)
	vmovdqu64 A_1_3, 8*8(%rdi)
	vmovdqu64 A_2_0, 10*8(%rdi)
	vmovdqu64 A_2_2, 0*8(%rax)
	vmovdqu64 A_2_4, 2*8(%rax)
	vmovdqu A_3_1, 4*8(%rax)
	vmovdqu A_3_3, 6*8(%rax)
	vmovdqu A_4_0, 8*8(%rax)
	vmovdqu A_4_2, 10*8(%rax)
	vmovq A_4_4, 12*8(%rax)

	leaq (%r8, %rcx), %rax	// return value
	movq %rdx, (%r9)	// return buffer pointer

	clear_regs()
	ret_spec_stop
	CFI_ENDPROC()
ELF(.size _gcry_keccak_absorb_blocks_avx512,
    .-_gcry_keccak_absorb_blocks_avx512)

#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
#endif /* __x86_64 */