/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.arch armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr
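
/*
 * The .irp block above gives each SIMD register used in this file a
 * numeric alias (.Lv<n>.4s = <n>).  The sm4e/sm4ekey macros below
 * splice those values into raw opcode words via .inst, presumably so
 * the file still assembles with toolchains that do not know the SM4
 * mnemonics.
 */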

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
        .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

#define RTMP0   v16
#define RTMP1   v17
#define RTMP2   v18
#define RTMP3   v19

#define RIV     v20

/* Helper macros. */

#define PREPARE                                   \
        ld1 {v24.16b-v27.16b}, [x0], #64;         \
        ld1 {v28.16b-v31.16b}, [x0];
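
/*
 * PREPARE loads the whole round-key schedule (32 x 32-bit round keys,
 * i.e. eight 128-bit vectors) from the array at x0 into v24-v31; each
 * sm4e pass below consumes four round keys from one of these vectors.
 */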

#define SM4_CRYPT_BLK(b0)                         \
        rev32 b0.16b, b0.16b;                     \
        sm4e b0.4s, v24.4s;                       \
        sm4e b0.4s, v25.4s;                       \
        sm4e b0.4s, v26.4s;                       \
        sm4e b0.4s, v27.4s;                       \
        sm4e b0.4s, v28.4s;                       \
        sm4e b0.4s, v29.4s;                       \
        sm4e b0.4s, v30.4s;                       \
        sm4e b0.4s, v31.4s;                       \
        rev64 b0.4s, b0.4s;                       \
        ext b0.16b, b0.16b, b0.16b, #8;           \
        rev32 b0.16b, b0.16b;
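
/*
 * SM4_CRYPT_BLK: the leading rev32 converts the big-endian input words
 * to host word order, each sm4e performs four rounds against one
 * round-key vector (8 x 4 = 32 rounds), and the trailing rev64 + ext +
 * rev32 apply the final word-swap of the SM4 state and restore the
 * big-endian byte order of the output block.
 */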

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)            \
        rev32 b0.16b, b0.16b;                     \
        rev32 b1.16b, b1.16b;                     \
        rev32 b2.16b, b2.16b;                     \
        rev32 b3.16b, b3.16b;                     \
        sm4e b0.4s, v24.4s;                       \
        sm4e b1.4s, v24.4s;                       \
        sm4e b2.4s, v24.4s;                       \
        sm4e b3.4s, v24.4s;                       \
        sm4e b0.4s, v25.4s;                       \
        sm4e b1.4s, v25.4s;                       \
        sm4e b2.4s, v25.4s;                       \
        sm4e b3.4s, v25.4s;                       \
        sm4e b0.4s, v26.4s;                       \
        sm4e b1.4s, v26.4s;                       \
        sm4e b2.4s, v26.4s;                       \
        sm4e b3.4s, v26.4s;                       \
        sm4e b0.4s, v27.4s;                       \
        sm4e b1.4s, v27.4s;                       \
        sm4e b2.4s, v27.4s;                       \
        sm4e b3.4s, v27.4s;                       \
        sm4e b0.4s, v28.4s;                       \
        sm4e b1.4s, v28.4s;                       \
        sm4e b2.4s, v28.4s;                       \
        sm4e b3.4s, v28.4s;                       \
        sm4e b0.4s, v29.4s;                       \
        sm4e b1.4s, v29.4s;                       \
        sm4e b2.4s, v29.4s;                       \
        sm4e b3.4s, v29.4s;                       \
        sm4e b0.4s, v30.4s;                       \
        sm4e b1.4s, v30.4s;                       \
        sm4e b2.4s, v30.4s;                       \
        sm4e b3.4s, v30.4s;                       \
        sm4e b0.4s, v31.4s;                       \
        sm4e b1.4s, v31.4s;                       \
        sm4e b2.4s, v31.4s;                       \
        sm4e b3.4s, v31.4s;                       \
        rev64 b0.4s, b0.4s;                       \
        rev64 b1.4s, b1.4s;                       \
        rev64 b2.4s, b2.4s;                       \
        rev64 b3.4s, b3.4s;                       \
        ext b0.16b, b0.16b, b0.16b, #8;           \
        ext b1.16b, b1.16b, b1.16b, #8;           \
        ext b2.16b, b2.16b, b2.16b, #8;           \
        ext b3.16b, b3.16b, b3.16b, #8;           \
        rev32 b0.16b, b0.16b;                     \
        rev32 b1.16b, b1.16b;                     \
        rev32 b2.16b, b2.16b;                     \
        rev32 b3.16b, b3.16b;
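
/*
 * SM4_CRYPT_BLK4 and SM4_CRYPT_BLK8 run the same 32-round sequence on
 * four or eight independent blocks, interleaving the sm4e instructions
 * so that consecutive instructions do not depend on each other and the
 * latency of the crypto unit is hidden.
 */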

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
        rev32 b0.16b, b0.16b;                     \
        rev32 b1.16b, b1.16b;                     \
        rev32 b2.16b, b2.16b;                     \
        rev32 b3.16b, b3.16b;                     \
        rev32 b4.16b, b4.16b;                     \
        rev32 b5.16b, b5.16b;                     \
        rev32 b6.16b, b6.16b;                     \
        rev32 b7.16b, b7.16b;                     \
        sm4e b0.4s, v24.4s;                       \
        sm4e b1.4s, v24.4s;                       \
        sm4e b2.4s, v24.4s;                       \
        sm4e b3.4s, v24.4s;                       \
        sm4e b4.4s, v24.4s;                       \
        sm4e b5.4s, v24.4s;                       \
        sm4e b6.4s, v24.4s;                       \
        sm4e b7.4s, v24.4s;                       \
        sm4e b0.4s, v25.4s;                       \
        sm4e b1.4s, v25.4s;                       \
        sm4e b2.4s, v25.4s;                       \
        sm4e b3.4s, v25.4s;                       \
        sm4e b4.4s, v25.4s;                       \
        sm4e b5.4s, v25.4s;                       \
        sm4e b6.4s, v25.4s;                       \
        sm4e b7.4s, v25.4s;                       \
        sm4e b0.4s, v26.4s;                       \
        sm4e b1.4s, v26.4s;                       \
        sm4e b2.4s, v26.4s;                       \
        sm4e b3.4s, v26.4s;                       \
        sm4e b4.4s, v26.4s;                       \
        sm4e b5.4s, v26.4s;                       \
        sm4e b6.4s, v26.4s;                       \
        sm4e b7.4s, v26.4s;                       \
        sm4e b0.4s, v27.4s;                       \
        sm4e b1.4s, v27.4s;                       \
        sm4e b2.4s, v27.4s;                       \
        sm4e b3.4s, v27.4s;                       \
        sm4e b4.4s, v27.4s;                       \
        sm4e b5.4s, v27.4s;                       \
        sm4e b6.4s, v27.4s;                       \
        sm4e b7.4s, v27.4s;                       \
        sm4e b0.4s, v28.4s;                       \
        sm4e b1.4s, v28.4s;                       \
        sm4e b2.4s, v28.4s;                       \
        sm4e b3.4s, v28.4s;                       \
        sm4e b4.4s, v28.4s;                       \
        sm4e b5.4s, v28.4s;                       \
        sm4e b6.4s, v28.4s;                       \
        sm4e b7.4s, v28.4s;                       \
        sm4e b0.4s, v29.4s;                       \
        sm4e b1.4s, v29.4s;                       \
        sm4e b2.4s, v29.4s;                       \
        sm4e b3.4s, v29.4s;                       \
        sm4e b4.4s, v29.4s;                       \
        sm4e b5.4s, v29.4s;                       \
        sm4e b6.4s, v29.4s;                       \
        sm4e b7.4s, v29.4s;                       \
        sm4e b0.4s, v30.4s;                       \
        sm4e b1.4s, v30.4s;                       \
        sm4e b2.4s, v30.4s;                       \
        sm4e b3.4s, v30.4s;                       \
        sm4e b4.4s, v30.4s;                       \
        sm4e b5.4s, v30.4s;                       \
        sm4e b6.4s, v30.4s;                       \
        sm4e b7.4s, v30.4s;                       \
        sm4e b0.4s, v31.4s;                       \
        sm4e b1.4s, v31.4s;                       \
        sm4e b2.4s, v31.4s;                       \
        sm4e b3.4s, v31.4s;                       \
        sm4e b4.4s, v31.4s;                       \
        sm4e b5.4s, v31.4s;                       \
        sm4e b6.4s, v31.4s;                       \
        sm4e b7.4s, v31.4s;                       \
        rev64 b0.4s, b0.4s;                       \
        rev64 b1.4s, b1.4s;                       \
        rev64 b2.4s, b2.4s;                       \
        rev64 b3.4s, b3.4s;                       \
        rev64 b4.4s, b4.4s;                       \
        rev64 b5.4s, b5.4s;                       \
        rev64 b6.4s, b6.4s;                       \
        rev64 b7.4s, b7.4s;                       \
        ext b0.16b, b0.16b, b0.16b, #8;           \
        ext b1.16b, b1.16b, b1.16b, #8;           \
        ext b2.16b, b2.16b, b2.16b, #8;           \
        ext b3.16b, b3.16b, b3.16b, #8;           \
        ext b4.16b, b4.16b, b4.16b, #8;           \
        ext b5.16b, b5.16b, b5.16b, #8;           \
        ext b6.16b, b6.16b, b6.16b, #8;           \
        ext b7.16b, b7.16b, b7.16b, #8;           \
        rev32 b0.16b, b0.16b;                     \
        rev32 b1.16b, b1.16b;                     \
        rev32 b2.16b, b2.16b;                     \
        rev32 b3.16b, b3.16b;                     \
        rev32 b4.16b, b4.16b;                     \
        rev32 b5.16b, b5.16b;                     \
        rev32 b6.16b, b6.16b;                     \
        rev32 b7.16b, b7.16b;
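
/*
 * sm4_ce_expand_key: derive the SM4 key schedule from the 128-bit user
 * key, writing the encryption round keys to rkey_enc and the same keys
 * in reverse order (as needed for decryption) to rkey_dec.  An
 * illustrative C declaration for the caller (the exact prototype lives
 * in the glue code) would be roughly:
 *
 *   void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec,
 *                          const u32 *fk, const u32 *ck);
 */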

.align 3
SYM_FUNC_START(sm4_ce_expand_key)
        /* input:
         *   x0: 128-bit key
         *   x1: rkey_enc
         *   x2: rkey_dec
         *   x3: fk array
         *   x4: ck array
         */
        ld1 {v0.16b}, [x0];
        rev32 v0.16b, v0.16b;
        ld1 {v1.16b}, [x3];
        /* load ck */
        ld1 {v24.16b-v27.16b}, [x4], #64;
        ld1 {v28.16b-v31.16b}, [x4];

        /* input ^ fk */
        eor v0.16b, v0.16b, v1.16b;

        sm4ekey v0.4s, v0.4s, v24.4s;
        sm4ekey v1.4s, v0.4s, v25.4s;
        sm4ekey v2.4s, v1.4s, v26.4s;
        sm4ekey v3.4s, v2.4s, v27.4s;
        sm4ekey v4.4s, v3.4s, v28.4s;
        sm4ekey v5.4s, v4.4s, v29.4s;
        sm4ekey v6.4s, v5.4s, v30.4s;
        sm4ekey v7.4s, v6.4s, v31.4s;

        st1 {v0.16b-v3.16b}, [x1], #64;
        st1 {v4.16b-v7.16b}, [x1];
        rev64 v7.4s, v7.4s;
        rev64 v6.4s, v6.4s;
        rev64 v5.4s, v5.4s;
        rev64 v4.4s, v4.4s;
        rev64 v3.4s, v3.4s;
        rev64 v2.4s, v2.4s;
        rev64 v1.4s, v1.4s;
        rev64 v0.4s, v0.4s;
        ext v7.16b, v7.16b, v7.16b, #8;
        ext v6.16b, v6.16b, v6.16b, #8;
        ext v5.16b, v5.16b, v5.16b, #8;
        ext v4.16b, v4.16b, v4.16b, #8;
        ext v3.16b, v3.16b, v3.16b, #8;
        ext v2.16b, v2.16b, v2.16b, #8;
        ext v1.16b, v1.16b, v1.16b, #8;
        ext v0.16b, v0.16b, v0.16b, #8;
        st1 {v7.16b}, [x2], #16;
        st1 {v6.16b}, [x2], #16;
        st1 {v5.16b}, [x2], #16;
        st1 {v4.16b}, [x2], #16;
        st1 {v3.16b}, [x2], #16;
        st1 {v2.16b}, [x2], #16;
        st1 {v1.16b}, [x2], #16;
        st1 {v0.16b}, [x2];

        ret;
SYM_FUNC_END(sm4_ce_expand_key)
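
/*
 * sm4_ce_crypt_block: encrypt or decrypt a single 16-byte block with
 * the round keys at x0; the direction is determined solely by which
 * key schedule (rkey_enc or rkey_dec) the caller passes in.
 */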

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         */
        PREPARE;

        ld1 {v0.16b}, [x2];
        SM4_CRYPT_BLK(v0);
        st1 {v0.16b}, [x1];

        ret;
SYM_FUNC_END(sm4_ce_crypt_block)
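
/*
 * sm4_ce_crypt: bulk helper for independent blocks.  The main loop
 * handles eight blocks per iteration; a four-block pass and then a
 * one-block loop mop up the tail, with w3 counting the remaining
 * 16-byte blocks.
 */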

.align 3
SYM_FUNC_START(sm4_ce_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        PREPARE;

.Lcrypt_loop_blk:
        sub w3, w3, #8;
        tbnz w3, #31, .Lcrypt_tail8;

        ld1 {v0.16b-v3.16b}, [x2], #64;
        ld1 {v4.16b-v7.16b}, [x2], #64;

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        st1 {v0.16b-v3.16b}, [x1], #64;
        st1 {v4.16b-v7.16b}, [x1], #64;

        cbz w3, .Lcrypt_end;
        b .Lcrypt_loop_blk;

.Lcrypt_tail8:
        add w3, w3, #8;
        cmp w3, #4;
        blt .Lcrypt_tail4;

        sub w3, w3, #4;

        ld1 {v0.16b-v3.16b}, [x2], #64;
        SM4_CRYPT_BLK4(v0, v1, v2, v3);
        st1 {v0.16b-v3.16b}, [x1], #64;

        cbz w3, .Lcrypt_end;

.Lcrypt_tail4:
        sub w3, w3, #1;

        ld1 {v0.16b}, [x2], #16;
        SM4_CRYPT_BLK(v0);
        st1 {v0.16b}, [x1], #16;

        cbnz w3, .Lcrypt_tail4;

.Lcrypt_end:
        ret;
SYM_FUNC_END(sm4_ce_crypt)
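
/*
 * sm4_ce_cbc_enc: CBC encryption is inherently serial (each block is
 * XORed with the previous ciphertext before encryption), so this loop
 * processes one block per iteration, keeps the running IV in RIV, and
 * writes the final IV back to [x3] for the next call.
 */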

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        PREPARE;

        ld1 {RIV.16b}, [x3];

.Lcbc_enc_loop:
        sub w4, w4, #1;

        ld1 {RTMP0.16b}, [x2], #16;
        eor RIV.16b, RIV.16b, RTMP0.16b;

        SM4_CRYPT_BLK(RIV);

        st1 {RIV.16b}, [x1], #16;

        cbnz w4, .Lcbc_enc_loop;

        /* store new IV */
        st1 {RIV.16b}, [x3];

        ret;
SYM_FUNC_END(sm4_ce_cbc_enc)
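
/*
 * sm4_ce_cbc_dec: CBC decryption parallelises, so eight (or four)
 * ciphertext blocks are decrypted at once and the ciphertext is then
 * re-read into RTMP0-RTMP3 to supply the chaining XOR; the last
 * ciphertext block of each batch becomes the next IV in RIV.
 */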

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        PREPARE;

        ld1 {RIV.16b}, [x3];

.Lcbc_loop_blk:
        sub w4, w4, #8;
        tbnz w4, #31, .Lcbc_tail8;

        ld1 {v0.16b-v3.16b}, [x2], #64;
        ld1 {v4.16b-v7.16b}, [x2];

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        sub x2, x2, #64;
        eor v0.16b, v0.16b, RIV.16b;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v1.16b, v1.16b, RTMP0.16b;
        eor v2.16b, v2.16b, RTMP1.16b;
        eor v3.16b, v3.16b, RTMP2.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        eor v4.16b, v4.16b, RTMP3.16b;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v5.16b, v5.16b, RTMP0.16b;
        eor v6.16b, v6.16b, RTMP1.16b;
        eor v7.16b, v7.16b, RTMP2.16b;

        mov RIV.16b, RTMP3.16b;
        st1 {v4.16b-v7.16b}, [x1], #64;

        cbz w4, .Lcbc_end;
        b .Lcbc_loop_blk;

.Lcbc_tail8:
        add w4, w4, #8;
        cmp w4, #4;
        blt .Lcbc_tail4;

        sub w4, w4, #4;

        ld1 {v0.16b-v3.16b}, [x2];

        SM4_CRYPT_BLK4(v0, v1, v2, v3);

        eor v0.16b, v0.16b, RIV.16b;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v1.16b, v1.16b, RTMP0.16b;
        eor v2.16b, v2.16b, RTMP1.16b;
        eor v3.16b, v3.16b, RTMP2.16b;

        mov RIV.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        cbz w4, .Lcbc_end;

.Lcbc_tail4:
        sub w4, w4, #1;

        ld1 {v0.16b}, [x2];

        SM4_CRYPT_BLK(v0);

        eor v0.16b, v0.16b, RIV.16b;
        ld1 {RIV.16b}, [x2], #16;
        st1 {v0.16b}, [x1], #16;

        cbnz w4, .Lcbc_tail4;

.Lcbc_end:
        /* store new IV */
        st1 {RIV.16b}, [x3];

        ret;
SYM_FUNC_END(sm4_ce_cbc_dec)
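
/*
 * sm4_ce_cfb_enc: CFB encryption is serial like CBC encryption; the
 * previous ciphertext (initially the IV) is encrypted to form the
 * keystream and XORed with one plaintext block per iteration.
 */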

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        PREPARE;

        ld1 {RIV.16b}, [x3];

.Lcfb_enc_loop:
        sub w4, w4, #1;

        SM4_CRYPT_BLK(RIV);

        ld1 {RTMP0.16b}, [x2], #16;
        eor RIV.16b, RIV.16b, RTMP0.16b;
        st1 {RIV.16b}, [x1], #16;

        cbnz w4, .Lcfb_enc_loop;

        /* store new IV */
        st1 {RIV.16b}, [x3];

        ret;
SYM_FUNC_END(sm4_ce_cfb_enc)
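
/*
 * sm4_ce_cfb_dec: CFB decryption parallelises because the keystream is
 * derived from already-known ciphertext; v0 carries the IV/previous
 * ciphertext block and v1-v7 are loaded with the following ciphertext
 * blocks before the batched encryption.
 */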

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        PREPARE;

        ld1 {v0.16b}, [x3];

.Lcfb_loop_blk:
        sub w4, w4, #8;
        tbnz w4, #31, .Lcfb_tail8;

        ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
        ld1 {v4.16b-v7.16b}, [x2];

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        sub x2, x2, #48;
        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v0.16b, v0.16b, RTMP0.16b;
        eor v1.16b, v1.16b, RTMP1.16b;
        eor v2.16b, v2.16b, RTMP2.16b;
        eor v3.16b, v3.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v4.16b, v4.16b, RTMP0.16b;
        eor v5.16b, v5.16b, RTMP1.16b;
        eor v6.16b, v6.16b, RTMP2.16b;
        eor v7.16b, v7.16b, RTMP3.16b;
        st1 {v4.16b-v7.16b}, [x1], #64;

        mov v0.16b, RTMP3.16b;

        cbz w4, .Lcfb_end;
        b .Lcfb_loop_blk;

.Lcfb_tail8:
        add w4, w4, #8;
        cmp w4, #4;
        blt .Lcfb_tail4;

        sub w4, w4, #4;

        ld1 {v1.16b, v2.16b, v3.16b}, [x2];

        SM4_CRYPT_BLK4(v0, v1, v2, v3);

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v0.16b, v0.16b, RTMP0.16b;
        eor v1.16b, v1.16b, RTMP1.16b;
        eor v2.16b, v2.16b, RTMP2.16b;
        eor v3.16b, v3.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        mov v0.16b, RTMP3.16b;

        cbz w4, .Lcfb_end;

.Lcfb_tail4:
        sub w4, w4, #1;

        SM4_CRYPT_BLK(v0);

        ld1 {RTMP0.16b}, [x2], #16;
        eor v0.16b, v0.16b, RTMP0.16b;
        st1 {v0.16b}, [x1], #16;

        mov v0.16b, RTMP0.16b;

        cbnz w4, .Lcfb_tail4;

.Lcfb_end:
        /* store new IV */
        st1 {v0.16b}, [x3];

        ret;
SYM_FUNC_END(sm4_ce_cfb_dec)
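
/*
 * sm4_ce_ctr_enc: the 128-bit counter is kept big endian in memory, so
 * it is loaded into x7:x8, byte-reversed for arithmetic, incremented
 * once per block via inc_le128 below, and reversed again before being
 * stored back to [x3].
 */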

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        PREPARE;

        ldp x7, x8, [x3];
        rev x7, x7;
        rev x8, x8;

.Lctr_loop_blk:
        sub w4, w4, #8;
        tbnz w4, #31, .Lctr_tail8;

#define inc_le128(vctr)                           \
        mov vctr.d[1], x8;                        \
        mov vctr.d[0], x7;                        \
        adds x8, x8, #1;                          \
        adc x7, x7, xzr;                          \
        rev64 vctr.16b, vctr.16b;
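
/*
 * inc_le128 copies the current counter (high half in x7, low half in
 * x8) into the given vector, bumps the 128-bit value with a carrying
 * add, and byte-reverses each half with rev64 so the vector again
 * holds the counter block in big-endian order for encryption.
 */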

        /* construct CTRs */
        inc_le128(v0);                  /* +0 */
        inc_le128(v1);                  /* +1 */
        inc_le128(v2);                  /* +2 */
        inc_le128(v3);                  /* +3 */
        inc_le128(v4);                  /* +4 */
        inc_le128(v5);                  /* +5 */
        inc_le128(v6);                  /* +6 */
        inc_le128(v7);                  /* +7 */

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v0.16b, v0.16b, RTMP0.16b;
        eor v1.16b, v1.16b, RTMP1.16b;
        eor v2.16b, v2.16b, RTMP2.16b;
        eor v3.16b, v3.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v4.16b, v4.16b, RTMP0.16b;
        eor v5.16b, v5.16b, RTMP1.16b;
        eor v6.16b, v6.16b, RTMP2.16b;
        eor v7.16b, v7.16b, RTMP3.16b;
        st1 {v4.16b-v7.16b}, [x1], #64;

        cbz w4, .Lctr_end;
        b .Lctr_loop_blk;

.Lctr_tail8:
        add w4, w4, #8;
        cmp w4, #4;
        blt .Lctr_tail4;

        sub w4, w4, #4;

        /* construct CTRs */
        inc_le128(v0);                  /* +0 */
        inc_le128(v1);                  /* +1 */
        inc_le128(v2);                  /* +2 */
        inc_le128(v3);                  /* +3 */

        SM4_CRYPT_BLK4(v0, v1, v2, v3);

        ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
        eor v0.16b, v0.16b, RTMP0.16b;
        eor v1.16b, v1.16b, RTMP1.16b;
        eor v2.16b, v2.16b, RTMP2.16b;
        eor v3.16b, v3.16b, RTMP3.16b;
        st1 {v0.16b-v3.16b}, [x1], #64;

        cbz w4, .Lctr_end;

.Lctr_tail4:
        sub w4, w4, #1;

        /* construct CTRs */
        inc_le128(v0);

        SM4_CRYPT_BLK(v0);

        ld1 {RTMP0.16b}, [x2], #16;
        eor v0.16b, v0.16b, RTMP0.16b;
        st1 {v0.16b}, [x1], #16;

        cbnz w4, .Lctr_tail4;

.Lctr_end:
        /* store new CTR */
        rev x7, x7;
        rev x8, x8;
        stp x7, x8, [x3];

        ret;
SYM_FUNC_END(sm4_ce_ctr_enc)