/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak
*
* Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*
* ---
*
* Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
* by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
* `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
* preprocessed assembly and fitted with new absorb function optimized for
* x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
*
* Original copyright license follows:
*
* Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain copyright notices,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* * Neither the name of the CRYPTOGAMS nor the names of its
* copyright holder and contributors may be used to endorse or
* promote products derived from this software without specific
* prior written permission.
*
* ALTERNATIVELY, provided that this notice is retained in full, this
* product may be distributed under the terms of the GNU General Public
* License (GPL), in which case the provisions of the GPL apply INSTEAD OF
* those given above.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
#include "asm-common-amd64.h"
.text
/* Register macros. */
#define A_0_0 %xmm31
#define A_0_1 %xmm30
#define A_0_2 %xmm29
#define A_0_3 %xmm28
#define A_0_4 %xmm27
#define A_1_0 %xmm26
#define A_1_1 %xmm25
#define A_1_2 %xmm24
#define A_1_3 %xmm23
#define A_1_4 %xmm22
#define A_2_0 %xmm21
#define A_2_1 %xmm20
#define A_2_2 %xmm19
#define A_2_3 %xmm18
#define A_2_4 %xmm17
#define A_3_0 %xmm16
#define A_3_1 %xmm15
#define A_3_2 %xmm14
#define A_3_3 %xmm13
#define A_3_4 %xmm12
#define A_4_0 %xmm11
#define A_4_1 %xmm10
#define A_4_2 %xmm9
#define A_4_3 %xmm8
#define A_4_4 %xmm7
#define C_0 %xmm6
#define C_1 %xmm5
#define C_2 %xmm4
#define C_3 %xmm3
#define C_4 %xmm2
#define C_5 %xmm1
#define C_6 %xmm0
#define D_0 C_4
#define D_1 C_5
#define D_2 C_6
#define D_3 C_2
#define D_4 C_3
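/* Each A_<i>_<j> register keeps state lane 5*i + j (i.e. the j-th lane of
* row i) in its low 64-bit quadword. C_0..C_6 are scratch registers; the
* D_* names alias them and hold the Theta D values. Only the low quadword
* of each register carries live state - the store paths below write lanes
* back with vpunpcklqdq/vmovq and never use the high halves. */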
/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */
#define eor3_d(dst_s1, s2, s3) \
vpternlogq $0x96, s3, s2, dst_s1;
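/* Truth-table immediate 0x96 selects the three-way XOR, so eor3_d computes
* dst_s1 ^= s2 ^ s3 per 64-bit lane (AVX512 counterpart of ARMv8 EOR3). */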
#define eor3(dst, s1, s2, s3) \
vmovdqa s1, dst; \
eor3_d(dst, s2, s3);
#define rax1_c(dst, s1, s2_rol1) \
vprolq $1, s2_rol1, dst; \
vpxor s1, dst, dst;
#define rax1_t(dst_s1, s2_rol1, tmp) \
vprolq $1, s2_rol1, tmp; \
vpxor tmp, dst_s1, dst_s1;
#define rax1_s(dst_s1, s2_rol1) \
vprolq $1, s2_rol1, s2_rol1; \
vpxor s2_rol1, dst_s1, dst_s1;
#define xar(dst, s1, s2, rol) \
vpxorq s2, s1, dst; \
vprolq $(rol), dst, dst;
#define xar_x(dst, s1, s2, rol) \
vpxor s2, s1, dst; \
vprolq $(rol), dst, dst;
#define bcax_d(dst_s1, s2, s3) \
vpternlogq $0xb4, s3, s2, dst_s1;
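/* Truth-table immediate 0xb4 selects A ^ (B & ~C), so bcax_d computes
* dst_s1 ^= s2 & ~s3 per 64-bit lane - the BCAX "and-not, then xor"
* operation used by the Keccak Chi step. */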
#define bcax(dst, s1, s2, s3) \
vmovdqa64 s1, dst; \
bcax_d(dst, s2, s3);
#define bcax_x(dst, s1, s2, s3) \
vmovdqa s1, dst; \
bcax_d(dst, s2, s3);
#define eor(dst, s1, s2) \
vpxorq s2, s1, dst;
/* Misc helper macros. */
#define clear_avx512_4regs(a, b, c, d) \
eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d);
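/* vzeroall clears only ymm0-ymm15, so the EVEX-only registers ymm16-ymm31,
* which hold most of the state, are zeroed with explicit XORs to avoid
* leaving sensitive material behind. */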
#define clear_regs() \
vzeroall; /* xmm0-xmm15 */ \
clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \
clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \
clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \
clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31);
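/* Internal helper: expects the 25 state lanes in the A_* registers,
* %r10 = pointer to the current round constant and %r11 = pointer to the
* last of the 24 round constants; it loops until the round using the last
* constant has completed. */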
ELF(.type KeccakF1600_ce,@function)
.align 64, 0xcc
KeccakF1600_ce:
.Loop_ce:
CFI_STARTPROC()
////////////////////////////////////////////////// Theta
eor3( C_0, A_4_0, A_3_0, A_2_0)
eor3( C_1, A_4_1, A_3_1, A_2_1)
eor3( C_3, A_4_3, A_3_3, A_2_3)
eor3( C_2, A_4_2, A_3_2, A_2_2)
eor3( C_4, A_4_4, A_3_4, A_2_4)
eor3_d( C_0, A_1_0, A_0_0)
eor3_d( C_1, A_1_1, A_0_1)
eor3_d( C_3, A_1_3, A_0_3)
eor3_d( C_2, A_1_2, A_0_2)
eor3_d( C_4, A_1_4, A_0_4)
rax1_c( C_5, C_0, C_2) // D[1]
rax1_t( C_2, C_4, C_6) // D[3]
rax1_c( C_6, C_1, C_3) // D[2]
rax1_s( C_3, C_0) // D[4]
rax1_s( C_4, C_1) // D[0]
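// Theta: C_x = A_0_x ^ A_1_x ^ A_2_x ^ A_3_x ^ A_4_x are the column
// parities; the rax1 steps above form D[x] = C[(x+4)%5] ^ rol64(C[(x+1)%5], 1)
// into D_0..D_4 (aliases of the C_* registers).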
////////////////////////////////////////////////// Theta+Rho+Pi
xar( C_0, A_0_1, D_1, 1) // C[0]=A[2][0]
xar( A_0_1, A_1_1, D_1, 44)
xar( A_1_1, A_1_4, D_4, 20)
xar( A_1_4, A_4_2, D_2, 61)
xar( A_4_2, A_2_4, D_4, 39)
xar( A_2_4, A_4_0, D_0, 18)
xar( C_1, A_0_2, D_2, 62) // C[1]=A[4][0]
xar( A_0_2, A_2_2, D_2, 43)
xar( A_2_2, A_2_3, D_3, 25)
xar( A_2_3, A_3_4, D_4, 8)
xar_x( A_3_4, A_4_3, D_3, 56)
xar( A_4_3, A_3_0, D_0, 41)
xar( A_3_0, A_0_4, D_4, 27)
xar_x( D_4, A_4_4, D_4, 14) // D[4]=A[0][4]
xar_x( A_4_4, A_4_1, D_1, 2)
xar( A_1_3, A_1_3, D_3, 55) // A[1][3]=A[4][1]
xar( A_0_4, A_3_1, D_1, 45) // A[0][4]=A[1][3]
xar( A_3_1, A_1_0, D_0, 36)
xar( A_1_0, A_0_3, D_3, 28)
eor( A_0_0, A_0_0, D_0)
xar_x( D_3, A_3_3, D_3, 21) // D[3]=A[0][3]
xar( A_0_3, A_3_2, D_2, 15) // A[0][3]=A[3][3]
xar( D_1, A_2_1, D_1, 10) // D[1]=A[3][2]
xar( D_2, A_1_2, D_2, 6) // D[2]=A[2][1]
xar( D_0, A_2_0, D_0, 3) // D[0]=A[1][2]
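// Rho+Pi: each xar() folds the remaining Theta XOR with D[x] into the
// lane's fixed Rho rotation, while the choice of destination register
// performs the Pi permutation; the trailing comments track lanes that
// temporarily live in a register other than their final home.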
////////////////////////////////////////////////// Chi+Iota
bcax_x( A_4_0, C_1, A_4_2, A_1_3) // A[1][3]=A[4][1]
bcax( A_4_1, A_1_3, A_4_3, A_4_2) // A[1][3]=A[4][1]
bcax_d( A_4_2, A_4_4, A_4_3)
bcax_d( A_4_3, C_1, A_4_4)
bcax_d( A_4_4, A_1_3, C_1) // A[1][3]=A[4][1]
bcax_x( A_3_2, D_1, A_3_4, A_0_3) // A[0][3]=A[3][3]
bcax( A_3_3, A_0_3, A_3_0, A_3_4) // A[0][3]=A[3][3]
bcax_d( A_3_4, A_3_1, A_3_0)
bcax_d( A_3_0, D_1, A_3_1)
bcax_d( A_3_1, A_0_3, D_1) // A[0][3]=A[3][3]
bcax( A_2_0, C_0, A_2_2, D_2)
bcax( A_2_1, D_2, A_2_3, A_2_2)
bcax_d( A_2_2, A_2_4, A_2_3)
bcax_d( A_2_3, C_0, A_2_4)
bcax_d( A_2_4, D_2, C_0)
bcax( A_1_2, D_0, A_1_4, A_0_4) // A[0][4]=A[1][3]
bcax( A_1_3, A_0_4, A_1_0, A_1_4) // A[0][4]=A[1][3]
bcax_d( A_1_4, A_1_1, A_1_0)
bcax_d( A_1_0, D_0, A_1_1)
bcax_d( A_1_1, A_0_4, D_0) // A[0][4]=A[1][3]
bcax( A_0_3, D_3, A_0_0, D_4)
bcax( A_0_4, D_4, A_0_1, A_0_0)
bcax_d( A_0_0, A_0_2, A_0_1)
bcax_d( A_0_1, D_3, A_0_2)
bcax_d( A_0_2, D_4, D_3)
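// Chi: each lane is combined with the next two lanes of its row as
// a ^= ~b & c via bcax. The eor below is Iota: the current round constant
// at (%r10) is XORed into lane A[0][0].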
eor( A_0_0, A_0_0, (%r10))
cmpq %r10, %r11
je .Lend_ce
addq $8, %r10
jmp .Loop_ce
.align 64, 0xcc
.Lend_ce:
ret_spec_stop
CFI_ENDPROC()
ELF(.size KeccakF1600_ce,.-KeccakF1600_ce)
.globl _gcry_keccak_f1600_state_permute64_avx512
ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function)
.align 64, 0xcc
_gcry_keccak_f1600_state_permute64_avx512:
/* input:
* %rdi: state
* %rsi: round constants
*/
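/* The state is 25 little-endian 64-bit lanes (200 bytes) at %rdi; %rsi
* points to the table of 24 round constants. The function returns zero in
* %eax (presumably the stack-burn depth expected by the C caller - no
* stack is used here). */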
CFI_STARTPROC()
spec_stop_avx512;
leaq 12*8(%rdi), %rax
leaq (24-1)*8(%rsi), %r11
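/* Load one lane per register (%rax = &state[12] for addressing the upper
* half). The 128-bit vmovdqu64/vmovdqu loads also pull in the following
* lane, but only the low quadword is used; the last lane is loaded with
* vmovq so the read stays within the 200-byte state. */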
vmovdqu64 0*8(%rdi), A_0_0
vmovdqu64 1*8(%rdi), A_0_1
vmovdqu64 2*8(%rdi), A_0_2
vmovdqu64 3*8(%rdi), A_0_3
vmovdqu64 4*8(%rdi), A_0_4
vmovdqu64 5*8(%rdi), A_1_0
vmovdqu64 6*8(%rdi), A_1_1
vmovdqu64 7*8(%rdi), A_1_2
vmovdqu64 8*8(%rdi), A_1_3
vmovdqu64 9*8(%rdi), A_1_4
vmovdqu64 10*8(%rdi), A_2_0
vmovdqu64 11*8(%rdi), A_2_1
vmovdqu64 0*8(%rax), A_2_2
vmovdqu64 1*8(%rax), A_2_3
vmovdqu64 2*8(%rax), A_2_4
vmovdqu64 3*8(%rax), A_3_0
vmovdqu 4*8(%rax), A_3_1
vmovdqu 5*8(%rax), A_3_2
vmovdqu 6*8(%rax), A_3_3
vmovdqu 7*8(%rax), A_3_4
vmovdqu 8*8(%rax), A_4_0
vmovdqu 9*8(%rax), A_4_1
vmovdqu 10*8(%rax), A_4_2
vmovdqu 11*8(%rax), A_4_3
vmovq 12*8(%rax), A_4_4
movq %rsi, %r10
call KeccakF1600_ce
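/* Repack: vpunpcklqdq merges the low quadwords of two lane registers so
* the state can be written back with 128-bit stores plus one vmovq for
* the final lane. */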
vpunpcklqdq A_0_1, A_0_0, A_0_0
vpunpcklqdq A_0_3, A_0_2, A_0_2
vpunpcklqdq A_1_0, A_0_4, A_0_4
vpunpcklqdq A_1_2, A_1_1, A_1_1
vpunpcklqdq A_1_4, A_1_3, A_1_3
vpunpcklqdq A_2_1, A_2_0, A_2_0
vpunpcklqdq A_2_3, A_2_2, A_2_2
vpunpcklqdq A_3_0, A_2_4, A_2_4
vpunpcklqdq A_3_2, A_3_1, A_3_1
vpunpcklqdq A_3_4, A_3_3, A_3_3
vpunpcklqdq A_4_1, A_4_0, A_4_0
vpunpcklqdq A_4_3, A_4_2, A_4_2
vmovdqu64 A_0_0, 0*8(%rdi)
vmovdqu64 A_0_2, 2*8(%rdi)
vmovdqu64 A_0_4, 4*8(%rdi)
vmovdqu64 A_1_1, 6*8(%rdi)
vmovdqu64 A_1_3, 8*8(%rdi)
vmovdqu64 A_2_0, 10*8(%rdi)
vmovdqu64 A_2_2, 0*8(%rax)
vmovdqu64 A_2_4, 2*8(%rax)
vmovdqu A_3_1, 4*8(%rax)
vmovdqu A_3_3, 6*8(%rax)
vmovdqu A_4_0, 8*8(%rax)
vmovdqu A_4_2, 10*8(%rax)
vmovq A_4_4, 12*8(%rax)
xorl %eax, %eax
clear_regs()
ret_spec_stop
CFI_ENDPROC()
ELF(.size _gcry_keccak_f1600_state_permute64_avx512,
.-_gcry_keccak_f1600_state_permute64_avx512)
.globl _gcry_keccak_absorb_blocks_avx512
ELF(.type _gcry_keccak_absorb_blocks_avx512,@function)
.align 64, 0xcc
_gcry_keccak_absorb_blocks_avx512:
/* input:
* %rdi: state
* %rsi: round constants
* %rdx: lanes
* %rcx: nlanes
* %r8 : blocklanes
* %r9 : lanes output pointer
*/
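/* Absorbs full blocks of %r8 lanes from the input at %rdx, permuting the
* state after each block. On return %rax holds the number of input lanes
* left unabsorbed (always less than one block) and the advanced input
* pointer is stored to (%r9). */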
CFI_STARTPROC()
spec_stop_avx512;
leaq 12*8(%rdi), %rax
leaq (24-1)*8(%rsi), %r11
vmovdqu64 0*8(%rdi), A_0_0
vmovdqu64 1*8(%rdi), A_0_1
vmovdqu64 2*8(%rdi), A_0_2
vmovdqu64 3*8(%rdi), A_0_3
vmovdqu64 4*8(%rdi), A_0_4
vmovdqu64 5*8(%rdi), A_1_0
vmovdqu64 6*8(%rdi), A_1_1
vmovdqu64 7*8(%rdi), A_1_2
vmovdqu64 8*8(%rdi), A_1_3
vmovdqu64 9*8(%rdi), A_1_4
vmovdqu64 10*8(%rdi), A_2_0
vmovdqu64 11*8(%rdi), A_2_1
vmovdqu64 0*8(%rax), A_2_2
vmovdqu64 1*8(%rax), A_2_3
vmovdqu64 2*8(%rax), A_2_4
vmovdqu64 3*8(%rax), A_3_0
vmovdqu 4*8(%rax), A_3_1
vmovdqu 5*8(%rax), A_3_2
vmovdqu 6*8(%rax), A_3_3
vmovdqu 7*8(%rax), A_3_4
vmovdqu 8*8(%rax), A_4_0
vmovdqu 9*8(%rax), A_4_1
vmovdqu 10*8(%rax), A_4_2
vmovdqu 11*8(%rax), A_4_3
vmovq 12*8(%rax), A_4_4
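/* Dispatch on the block size in lanes: 9 lanes = 72 bytes (SHA3-512),
* 13 = 104 (SHA3-384), 17 = 136 (SHA3-256/SHAKE256), 18 = 144 (SHA3-224),
* 21 = 168 (SHAKE128). */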
cmpq $(104 >> 3), %r8
jb .Loop_absorb_72_ce
je .Loop_absorb_104_ce
cmpq $(144 >> 3), %r8
jb .Loop_absorb_136_ce
je .Loop_absorb_144_ce
jmp .Loop_absorb_168_ce
.align 64, 0xcc
.Loop_absorb_168_ce:
subq %r8, %rcx // len - bsz
jb .Labsorbed_ce
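// XOR one 21-lane (SHAKE128-rate) block into the state. The last lane goes
// through C_0 with vmovq/vpxorq so that only 8 bytes of it are read (a
// 128-bit load would reach past the end of the 168-byte block).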
vpxorq 0*8(%rdx), A_0_0, A_0_0
vpxorq 1*8(%rdx), A_0_1, A_0_1
vpxorq 2*8(%rdx), A_0_2, A_0_2
vpxorq 3*8(%rdx), A_0_3, A_0_3
vpxorq 4*8(%rdx), A_0_4, A_0_4
vpxorq 5*8(%rdx), A_1_0, A_1_0
vpxorq 6*8(%rdx), A_1_1, A_1_1
vpxorq 7*8(%rdx), A_1_2, A_1_2
vpxorq 8*8(%rdx), A_1_3, A_1_3
vpxorq 9*8(%rdx), A_1_4, A_1_4
vpxorq 10*8(%rdx), A_2_0, A_2_0
vpxorq 11*8(%rdx), A_2_1, A_2_1
vpxorq 12*8(%rdx), A_2_2, A_2_2
vpxorq 13*8(%rdx), A_2_3, A_2_3
vpxorq 14*8(%rdx), A_2_4, A_2_4
vpxorq 15*8(%rdx), A_3_0, A_3_0
vpxor 16*8(%rdx), A_3_1, A_3_1
vpxor 17*8(%rdx), A_3_2, A_3_2
vpxor 18*8(%rdx), A_3_3, A_3_3
vpxor 19*8(%rdx), A_3_4, A_3_4
vmovq 20*8(%rdx), C_0
leaq 21*8(%rdx), %rdx
vpxorq C_0, A_4_0, A_4_0
movq %rsi, %r10
call KeccakF1600_ce
jmp .Loop_absorb_168_ce
.align 64, 0xcc
.Loop_absorb_144_ce:
subq %r8, %rcx // len - bsz
jb .Labsorbed_ce
vpxorq 0*8(%rdx), A_0_0, A_0_0
vpxorq 1*8(%rdx), A_0_1, A_0_1
vpxorq 2*8(%rdx), A_0_2, A_0_2
vpxorq 3*8(%rdx), A_0_3, A_0_3
vpxorq 4*8(%rdx), A_0_4, A_0_4
vpxorq 5*8(%rdx), A_1_0, A_1_0
vpxorq 6*8(%rdx), A_1_1, A_1_1
vpxorq 7*8(%rdx), A_1_2, A_1_2
vpxorq 8*8(%rdx), A_1_3, A_1_3
vpxorq 9*8(%rdx), A_1_4, A_1_4
vpxorq 10*8(%rdx), A_2_0, A_2_0
vpxorq 11*8(%rdx), A_2_1, A_2_1
vpxorq 12*8(%rdx), A_2_2, A_2_2
vpxorq 13*8(%rdx), A_2_3, A_2_3
vpxorq 14*8(%rdx), A_2_4, A_2_4
vpxorq 15*8(%rdx), A_3_0, A_3_0
vpxor 16*8(%rdx), A_3_1, A_3_1
vmovq 17*8(%rdx), C_0
leaq 18*8(%rdx), %rdx
vpxor C_0, A_3_2, A_3_2
movq %rsi, %r10
call KeccakF1600_ce
jmp .Loop_absorb_144_ce
.align 64, 0xcc
.Loop_absorb_136_ce:
subq %r8, %rcx // len - bsz
jb .Labsorbed_ce
vpxorq 0*8(%rdx), A_0_0, A_0_0
vpxorq 1*8(%rdx), A_0_1, A_0_1
vpxorq 2*8(%rdx), A_0_2, A_0_2
vpxorq 3*8(%rdx), A_0_3, A_0_3
vpxorq 4*8(%rdx), A_0_4, A_0_4
vpxorq 5*8(%rdx), A_1_0, A_1_0
vpxorq 6*8(%rdx), A_1_1, A_1_1
vpxorq 7*8(%rdx), A_1_2, A_1_2
vpxorq 8*8(%rdx), A_1_3, A_1_3
vpxorq 9*8(%rdx), A_1_4, A_1_4
vpxorq 10*8(%rdx), A_2_0, A_2_0
vpxorq 11*8(%rdx), A_2_1, A_2_1
vpxorq 12*8(%rdx), A_2_2, A_2_2
vpxorq 13*8(%rdx), A_2_3, A_2_3
vpxorq 14*8(%rdx), A_2_4, A_2_4
vpxorq 15*8(%rdx), A_3_0, A_3_0
vmovq 16*8(%rdx), C_0
leaq 17*8(%rdx), %rdx
vpxor C_0, A_3_1, A_3_1
movq %rsi, %r10
call KeccakF1600_ce
jmp .Loop_absorb_136_ce
.align 64, 0xcc
.Loop_absorb_104_ce:
subq %r8, %rcx // len - bsz
jb .Labsorbed_ce
vpxorq 0*8(%rdx), A_0_0, A_0_0
vpxorq 1*8(%rdx), A_0_1, A_0_1
vpxorq 2*8(%rdx), A_0_2, A_0_2
vpxorq 3*8(%rdx), A_0_3, A_0_3
vpxorq 4*8(%rdx), A_0_4, A_0_4
vpxorq 5*8(%rdx), A_1_0, A_1_0
vpxorq 6*8(%rdx), A_1_1, A_1_1
vpxorq 7*8(%rdx), A_1_2, A_1_2
vpxorq 8*8(%rdx), A_1_3, A_1_3
vpxorq 9*8(%rdx), A_1_4, A_1_4
vpxorq 10*8(%rdx), A_2_0, A_2_0
vpxorq 11*8(%rdx), A_2_1, A_2_1
vmovq 12*8(%rdx), C_0
leaq 13*8(%rdx), %rdx
vpxorq C_0, A_2_2, A_2_2
movq %rsi, %r10
call KeccakF1600_ce
jmp .Loop_absorb_104_ce
.align 64, 0xcc
.Loop_absorb_72_ce:
subq %r8, %rcx // len - bsz
jb .Labsorbed_ce
vpxorq 0*8(%rdx), A_0_0, A_0_0
vpxorq 1*8(%rdx), A_0_1, A_0_1
vpxorq 2*8(%rdx), A_0_2, A_0_2
vpxorq 3*8(%rdx), A_0_3, A_0_3
vpxorq 4*8(%rdx), A_0_4, A_0_4
vpxorq 5*8(%rdx), A_1_0, A_1_0
vpxorq 6*8(%rdx), A_1_1, A_1_1
vpxorq 7*8(%rdx), A_1_2, A_1_2
vmovq 8*8(%rdx), C_0
leaq 9*8(%rdx), %rdx
vpxorq C_0, A_1_3, A_1_3
movq %rsi, %r10
call KeccakF1600_ce
jmp .Loop_absorb_72_ce
.align 64, 0xcc
.Labsorbed_ce:
vpunpcklqdq A_0_1, A_0_0, A_0_0
vpunpcklqdq A_0_3, A_0_2, A_0_2
vpunpcklqdq A_1_0, A_0_4, A_0_4
vpunpcklqdq A_1_2, A_1_1, A_1_1
vpunpcklqdq A_1_4, A_1_3, A_1_3
vpunpcklqdq A_2_1, A_2_0, A_2_0
vpunpcklqdq A_2_3, A_2_2, A_2_2
vpunpcklqdq A_3_0, A_2_4, A_2_4
vpunpcklqdq A_3_2, A_3_1, A_3_1
vpunpcklqdq A_3_4, A_3_3, A_3_3
vpunpcklqdq A_4_1, A_4_0, A_4_0
vpunpcklqdq A_4_3, A_4_2, A_4_2
vmovdqu64 A_0_0, 0*8(%rdi)
vmovdqu64 A_0_2, 2*8(%rdi)
vmovdqu64 A_0_4, 4*8(%rdi)
vmovdqu64 A_1_1, 6*8(%rdi)
vmovdqu64 A_1_3, 8*8(%rdi)
vmovdqu64 A_2_0, 10*8(%rdi)
vmovdqu64 A_2_2, 0*8(%rax)
vmovdqu64 A_2_4, 2*8(%rax)
vmovdqu A_3_1, 4*8(%rax)
vmovdqu A_3_3, 6*8(%rax)
vmovdqu A_4_0, 8*8(%rax)
vmovdqu A_4_2, 10*8(%rax)
vmovq A_4_4, 12*8(%rax)
leaq (%r8, %rcx), %rax // return value
movq %rdx, (%r9) // return buffer pointer
clear_regs()
ret_spec_stop
CFI_ENDPROC()
ELF(.size _gcry_keccak_absorb_blocks_avx512,
.-_gcry_keccak_absorb_blocks_avx512)
#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
#endif /* __x86_64 */