corecrypto/acceleratecrypto/Source/aes/arm/vpaes-armv7.s


# Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if !defined(__arm64__) && defined(__ARM_NEON__)
#define ekey r2
#define eax r4
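// Per the AAPCS only q4-q7 are callee-saved, so that is all user-space code
// needs to preserve; kernel builds (BUILDKERNEL) save and restore every
// vector register the code touches (q0-q15).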
.macro save_all_neon
#if BUILDKERNEL
vstmdb sp!, {q12-q15}
vstmdb sp!, {q8-q11}
vstmdb sp!, {q0-q3}
#endif
vstmdb sp!, {q4-q7}
.endm
.macro restore_all_neon
vldmia sp!, {q4-q7}
#if BUILDKERNEL
vldmia sp!, {q0-q3}
vldmia sp!, {q8-q11}
vldmia sp!, {q12-q15}
#endif
.endm
.macro vpaes_push
push {r4-r7,lr}
add r7, sp, #12
push {r8,r10,r11}
.endm
.macro vpaes_pop
pop {r8,r10,r11}
pop {r4-r7,pc}
.endm
.p2align 6
.Lk_ipt:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_mc_forward:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
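// .Lk_sr values (ShiftRows permutations), reached as .Lk_mc_backward+64+r11
// by the final encrypt round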
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
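// vpaes_encrypt_core
// AES-encrypt the block in q0.
//   q0        = input block
//   ekey (r2) = key schedule; the round count is at byte offset 240
//   q9-q15    = constants, as loaded by vpaes_preheat
// Output is returned in q0.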
.p2align 4
vpaes_encrypt_core:
mov r9, ekey
mov r11, #16
adr r10, .Lk_ipt
ldr eax, [ekey, #240]
vldmia r10!,{q3-q4}
vbic q1, q0, q9
vld1.8 {q5}, [r9]!
vshr.u32 q1, q1, #4
vand q0, q0, q9
vtbl.8 d4, {q3}, d0
vtbl.8 d5, {q3}, d1
adr r10, .Lk_mc_backward
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
veor q2, q2, q5
veor q0, q0, q2
cmp eax, #0
b .Lenc_entry
.p2align 4
.Lenc_loop:
vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q4, q4, q5
add r12, r10, r11
veor q5, q0, q4
vld1.8 {q4}, [r12 :128]
sub r12, r12, #64
vtbl.8 d12, {q15}, d4
vtbl.8 d13, {q15}, d5
vld1.8 {q1}, [r12 :128]
vtbl.8 d4, {q14}, d6
vtbl.8 d5, {q14}, d7
veor q2, q2, q6
vtbl.8 d6, {q5}, d8
vtbl.8 d7, {q5}, d9
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
veor q5, q0, q2
add r11, r11, #16
veor q3, q3, q5
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
and r11, r11, #48
subs eax, eax, #1
veor q0, q0, q3
.Lenc_entry:
vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d10, {q11}, d0
vtbl.8 d11, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q3, q3, q5
veor q4, q4, q5
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
veor q2, q2, q0
veor q3, q3, q1
vld1.8 {q5}, [r9]!
bgt .Lenc_loop
adr r12, .Lk_sbo
vld1.8 {q1}, [r12]!
vtbl.8 d8, {q1}, d4
vtbl.8 d9, {q1}, d5
vld1.8 {q2}, [r12]
add r12, r10, r11
veor q4, q4, q5
add r12, r12, #64
vtbl.8 d0, {q2}, d6
vtbl.8 d1, {q2}, d7
vld1.8 {q1}, [r12]
veor q2, q0, q4
vtbl.8 d0, {q2}, d2
vtbl.8 d1, {q2}, d3
bx lr
.p2align 4
.Lk_dipt:
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.quad 0x000302010C0F0E0D, 0x080B0A0904070605 // .Lk_mc_forward+48
.Lk_dsb9:
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
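// ShiftRows permutations for the last decrypt round, indexed as
// .Lk_dsbo+32 + (r11 >> 1)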
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x0F060D040B020900, 0x070E050C030A0108
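// vpaes_decrypt_core
// AES-decrypt the block in q0.
//   q0     = input block
//   r2     = decryption key schedule (dkey); the round count is at byte offset 240
//   q9-q15 = constants, as loaded by vpaes_preheat
// Output is returned in q0.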
.p2align 4
vpaes_decrypt_core:
mov r9, r2 // dkey
ldr eax, [r2, #240] // Nr
adr r12, .Lk_dipt
vbic q1, q0, q9
vld1.64 {q3}, [r12 :128]!
vshr.u32 q1, q1, #4
vld1.8 {q5}, [r9]!
lsl r11, eax, #4
vand q2, q0, q9
vtbl.8 d4, {q3}, d4
vtbl.8 d5, {q3}, d5
vld1.64 {q4}, [r12 :128]!
eor r11, r11, #48
adr r10, .Lk_dsbd
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
and r11, r11, #48
veor q2, q2, q5
vld1.64 {q5}, [r12 :128]!
veor q0, q0, q2
cmp eax, #0
b .Ldec_entry
.p2align 4
.Ldec_loop:
sub r12, r10, #32
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
veor q0, q0, q4
vext.8 q5, q5, q5, #12
veor q0, q0, q1
subs eax, eax, #1
.Ldec_entry:
vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
vld1.8 {q0}, [r9]!
veor q3, q3, q1
bne .Ldec_loop
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
add r12, r12, r11, lsr #1
vtbl.8 d6, {q7}, d6
vtbl.8 d7, {q7}, d7
vld1.64 {q2}, [r12]
veor q0, q0, q4
veor q1, q0, q3
vtbl.8 d0, {q1}, d4
vtbl.8 d1, {q1}, d5
bx lr
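// key schedule constants: .Lk_ipt2 duplicates the .Lk_ipt input transform,
// .Lk_rcon holds the round constants, and .Lk_sr the ShiftRows permutations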
.p2align 6
.Lk_ipt2:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_rcon:
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_sr:
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
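// vpaes_schedule_core
// Expand a user key for encryption or decryption:
//   r0 = user key
//   r1 = key size in bits (128/192/256)
//   r2 = key schedule buffer
//   r3 = direction: 0 = encrypt, 1 = decrypt
//   r8 = initial ShiftRows index into .Lk_sr (set up by the callers)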
.p2align 4
vpaes_schedule_core:
bl vpaes_preheat
adr r10, .Lk_rcon
vld1.8 {q0}, [r0]
vld1.64 {q8}, [r10 :128]!
vmov q3, q0
adr r11, .Lk_ipt2
bl vpaes_schedule_transform
vmov q7, q0
cmp r3, #0
bne .Lschedule_am_decrypting
vst1.8 {q0}, [r2]
b .Lschedule_go
.Lschedule_am_decrypting:
add r12, r10, r8
vmov q1, q3
vld1.8 {q3}, [r12]
vtbl.8 d6, {q1}, d6
vtbl.8 d7, {q1}, d7
eor r8, r8, #48
vst1.8 {q3}, [r2]
.Lschedule_go:
cmp r1, #192
bgt .Lschedule_256
beq .Lschedule_192
.Lschedule_128:
mov r1, #10
.Loop_schedule_128:
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
b .Loop_schedule_128
.p2align 4
.Lschedule_192:
add r12, r0, #8
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
vmov d13, d1
veor d12, d12, d12
mov r1, #4
.Loop_schedule_192:
bl vpaes_schedule_round
vext.8 q0, q6, q0, #8
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
bl vpaes_schedule_mangle
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
b .Loop_schedule_192
.p2align 4
.Lschedule_256:
add r12, r0, #16
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
mov r1, #7
.Loop_schedule_256:
bl vpaes_schedule_mangle
vmov q6, q0
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
vdup.32 q0, d1[1]
vmov q5, q7
vmov q7, q6
bl vpaes_schedule_low_round
vmov q7, q5
b .Loop_schedule_256
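// .Lk_opt is the output transform applied to the last round key;
// .Lk_deskew undoes the 'skew' of the schedule's internal basis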
.p2align 4
.Lk_opt:
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.p2align 4
.Lschedule_mangle_last:
adr r11, .Lk_deskew
cmp r3, #0
bne .Lschedule_mangle_last_dec
add r12, r8, r10
vld1.8 {q1}, [r12]
adr r11, .Lk_opt
vtbl.8 d2, {q0}, d2
vtbl.8 d3, {q0}, d3
vmov q0, q1
add r2, r2, #32
.Lschedule_mangle_last_dec:
adr r12, .Lk_s63
sub r2, r2, #16
vld1.8 {q1}, [r12]
veor q0, q0, q1
bl vpaes_schedule_transform
vst1.8 {q0}, [r2]
restore_all_neon
eor r0, r0, r0 // return 0 for SUCCESS
vpaes_pop
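// vpaes_schedule_192_smear
// Smear the short, low side of the 192-bit key schedule into q6 and q0;
// the low half of q6 (d12) is cleared for the next iteration.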
.p2align 4
vpaes_schedule_192_smear:
vdup.32 q1, d12[0]
vdup.32 q0, d15[1]
vmov s7, s26
vmov s0, s30
veor q6, q6, q1
veor q6, q6, q0
vmov q0, q6
veor d12, d12, d12
bx lr
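// vpaes_schedule_round
// One main round of the key schedule on q0/q7: rotate the last word of q0
// by one byte, add the round constant from q8 into q7 (rotating q8 for the
// next round), smear the words of q7, and apply SubBytes. Results are left
// in q7 = q0.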
.p2align 4
vpaes_schedule_round:
veor q1, q1, q1
vext.8 q1, q8, q1, #15
vext.8 q8, q8, q8, #15
veor q7, q7, q1
vdup.32 q0, d1[1]
vext.8 q0, q0, q0, #1
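// low round: same as the main round, but without the word rotation and rcon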
vpaes_schedule_low_round:
veor q1, q1, q1
adr r12, .Lk_s63
vext.8 q1, q1, q7, #12
veor q2, q2, q2
veor q7, q7, q1
vld1.8 {q1}, [r12]
vext.8 q2, q2, q7, #8
veor q7, q7, q1
veor q7, q7, q2
vbic q1, q0, q9
vshr.u32 q1, q1, #4
vand q0, q0, q9
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
veor q3, q3, q1
vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q0, q0, q4
veor q0, q0, q7
vmov q7, q0
bx lr
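// vpaes_schedule_transform
// Linear-transform q0 using the (lo, hi) table pair at [r11]; requires
// q9 = 0x0F..0F as set by vpaes_preheat. Output in q0; clobbers q1, q4, q5.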
.p2align 4
vpaes_schedule_transform:
vbic q1, q0, q9
vldmia r11, {q4-q5}
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d0, {q4}, d0
vtbl.8 d1, {q4}, d1
vtbl.8 d2, {q5}, d2
vtbl.8 d3, {q5}, d3
veor q0, q0, q1
bx lr
.p2align 4
.Lk_mc_forward2:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.Lk_s63:
.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
.Lk_dksd:
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
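// vpaes_schedule_mangle
// Mangle the round key in q0 into its stored form and write it to [r2]:
// encryption keys are smeared with .Lk_s63; decryption keys go through the
// inverse-MixColumns transforms (.Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9).
// r2 advances by +16 (encrypt) or -16 (decrypt), and the ShiftRows index
// in r8 is updated. q6/q7 are preserved on the stack.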
.p2align 4
vpaes_schedule_mangle:
vstmdb sp!, {q6-q7}
adr r12, .Lk_mc_forward2
vmov q4, q0
cmp r3, #0
vldmia r12!, {q5-q6} // q5 = .Lk_mc_forward2, q6 = .Lk_s63
bne .Lschedule_mangle_dec
add r2, r2, #16
veor q4, q4, q6
vtbl.8 d6, {q4}, d10
vtbl.8 d7, {q4}, d11
vtbl.8 d8, {q3}, d10
vtbl.8 d9, {q3}, d11
vtbl.8 d2, {q4}, d10
vtbl.8 d3, {q4}, d11
veor q3, q3, q4
veor q3, q3, q1
b .Lschedule_mangle_both
.p2align 4
.Lschedule_mangle_dec:
vbic q1, q4, q9
vldmia r12!, {q6-q7}
vshr.u32 q1, q1, #4
vand q4, q4, q9
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
veor q3, q3, q2
sub r2, r2, #16
.Lschedule_mangle_both:
add r12, r10, r8
vld1.8 {q1}, [r12]
sub r8, r8, #16
vtbl.8 d4, {q3}, d2
vtbl.8 d5, {q3}, d3
and r8, r8, #48
vst1.8 {q2}, [r2]
vldmia sp!, {q6-q7}
bx lr
/*
int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, void *key);
*/
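// The AES_bits argument must be 128, 192, or 256. The schedule is written
// into the key buffer and the round count (AES_bits/32 + 5) is stored at
// byte offset 240; the function returns 0 on success.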
#define userKey r0
#define AES_bits r1
#define key r2
#define t r12
.globl _AccelerateCrypto_vpaes_set_encrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_encrypt_key:
// AES_bits/32 = 4/6/8 for 128/192/256; + 5 = 9/11/13 rounds, stored at [key, #240]
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
add t, r11, #5
mov r3, #0
str t, [key, #240]
mov r8, #48
b vpaes_schedule_core
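// Decryption schedules are written back to front, so the schedule pointer
// starts at the end of the key buffer (key + 16 + rounds*16).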
.globl _AccelerateCrypto_vpaes_set_decrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_decrypt_key:
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
mov r8, #32
add t, r11, #5
and r8, r8, AES_bits, lsr #1
mov r3, #1
str t, [key, #240]
add key, key, #16
eor r8, r8, #32
add key, key, t, lsl #4
b vpaes_schedule_core
/*
int vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
*/
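/*
 A minimal usage sketch (illustrative only). The key-schedule buffer layout
 is assumed from the code below: at least 244 bytes, with the round count at
 byte offset 240. Declaring the key parameter as void * here is a
 simplification of the AES_KEY type used above.

     #include <stdint.h>

     extern int AccelerateCrypto_vpaes_set_encrypt_key(const uint8_t *userKey, int bits, void *key);
     extern int AccelerateCrypto_vpaes_encrypt(const unsigned char *in, unsigned char *out, const void *key);

     uint8_t user_key[16] = {0};      // hypothetical all-zero 128-bit key
     uint8_t pt[16] = {0}, ct[16];    // one 16-byte block
     uint32_t ks[61];                 // 244 bytes: schedule + rounds word

     void demo(void) {
         AccelerateCrypto_vpaes_set_encrypt_key(user_key, 128, ks);
         AccelerateCrypto_vpaes_encrypt(pt, ct, ks);  // returns 0 for success
     }
*/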
#define in r0
#define out r1
#define key r2
.globl _AccelerateCrypto_vpaes_encrypt
.p2align 4
_AccelerateCrypto_vpaes_encrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_encrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0, r0 // return 0 for SUCCESS
vpaes_pop
.globl _AccelerateCrypto_vpaes_decrypt
.p2align 4
_AccelerateCrypto_vpaes_decrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_decrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0, r0 // return 0 for SUCCESS
vpaes_pop
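// vpaes_preheat
// Load the constants below into q9-q15 (q9 = .Lk_s0F, q10-q11 = .Lk_inv,
// q12-q13 = .Lk_sb1, q14-q15 = .Lk_sb2) for use by the *_core routines.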
.p2align 4
vpaes_preheat:
adr r12, .Lk_s0F
vldmia r12, {q9-q15}
bx lr
.p2align 6
// the following seven 16-byte constants are loaded into q9-q15 by vpaes_preheat:
.Lk_s0F:
.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
.Lk_inv:
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_sb1:
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
#endif // !defined(__arm64__) && defined(__ARM_NEON__)