478 lines
12 KiB
ArmAsm
478 lines
12 KiB
ArmAsm
# Copyright (c) (2011,2012,2013,2014,2015,2016,2019) Apple Inc. All rights reserved.
|
||
#
|
||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||
# devices and computers you own or control, for the sole purpose of verifying the
|
||
# security characteristics and correct functioning of the Apple Software. You may
|
||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||
#if defined(__arm__)
|
||
|
||
// ---------------------------------------------------------------------------
// Build-time configuration.
//
// S0-S3 are the four 32-bit words of the AES state held in r0-r3.
//
// Select chooses which routine this file assembles:
//     Select == 0  ->  encryption
//     Select == 1  ->  decryption
// Any other defined value is rejected below. An undefined Select evaluates as
// 0 inside a preprocessor conditional and therefore builds the encryption routine.
//
// P0-P3 name the state words that receive the rotated table results in the
// round macros; the mapping differs between encryption and decryption.
// Increment is the signed byte step applied to ExpandedKey after each
// 16-byte round key is loaded.
// ---------------------------------------------------------------------------
#define S0 r0
#define S1 r1
#define S2 r2
#define S3 r3

#if Select == 0
#define Name _AccelerateCrypto_AES_encrypt // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _AccelerateCrypto_AES_decrypt // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#else
// Fail early: without this, Name, MTable, FTable, P0-P3 and Increment would
// be left undefined and the rest of the file would assemble into nonsense.
#error "Select must be 0 (encrypt) or 1 (decrypt)"
#endif // Select

#if defined(__ARM_NEON__) // vpaes uses NEON instructions
.extern _AccelerateCrypto_vpaes_encrypt
.extern _AccelerateCrypto_vpaes_decrypt
#endif

#define ExpandedKey r11 // Pointer to the round key to load next.
#define ExpandedKeyEnd lr // Address of the last round key; ends the round loop.
#define ContextKeyLength 240 // Byte offset of the key length field in the key context.
#define t r12 // Scratch register.
|
||
|
||
// Entry point of Name (the encrypt or decrypt routine, selected by Select).
// When __ARM_NEON__ is defined the routine is only a tail branch to the vpaes implementation.
// In the non-NEON build the code below reads its arguments as follows:
//   r0 = pointer to the 16-byte input block (copied to r4)
//   r1 = pointer to the 16-byte output block (pushed, reloaded before the final store)
//   r2 = pointer to the expanded key context (copied to ExpandedKey)
// and returns r0 = 0 on success, or -1 when the key length field is not 160, 192 or 224.
.subsections_via_symbols
|
||
.text
|
||
.syntax unified
|
||
.p2align 2
|
||
.code 16
|
||
.thumb_func Name
|
||
.globl Name
|
||
Name:
|
||
#if defined(__ARM_NEON__) // if neon is available, use cache-attack resilient vector permute AES
|
||
|
||
#if Select == 0
|
||
b _AccelerateCrypto_vpaes_encrypt
|
||
#else
|
||
b _AccelerateCrypto_vpaes_decrypt
|
||
#endif
|
||
|
||
#else // __ARM_NEON__
|
||
|
||
// set up debug trace frame pointer (r7 points at the saved r7/lr pair)
|
||
push {r7,lr}
|
||
mov r7, sp
|
||
|
||
// now set up the stack for the current function
// r1 (output pointer) is pushed with the callee-saved registers and lands at the lowest address of this push
|
||
push {r1,r4-r6,r8-r11}
|
||
sub sp, #(16+8) // 16 bytes of state scratch at sp[0:15] plus 8 bytes of padding; the frame totals 8+32+24 = 64 bytes, so sp keeps whatever 16-byte alignment it had on entry
|
||
|
||
// copy r0,r2 to r4,r11 to release r0,r2 (r1 is saved in stack) for use as S0-S3
|
||
mov r4, r0
|
||
mov ExpandedKey, r2
|
||
|
||
// Get and check "key length".
// Only 160, 192 and 224 are accepted (presumably 16 bytes times 10, 12 or 14 rounds - TODO confirm); anything else returns -1.
|
||
ldr t, [ExpandedKey, #ContextKeyLength]
|
||
cmp t, #160
|
||
beq 2f
|
||
cmp t, #192
|
||
beq 2f
|
||
cmp t, #224
|
||
beq 2f
|
||
mov r0, #-1 // Return error.
|
||
b 9f
|
||
2:
|
||
|
||
#if (Select == 0)
|
||
// For encryption, prepare to iterate forward through expanded key.
// ExpandedKeyEnd = address of the round key located t bytes after the first one.
|
||
add ExpandedKeyEnd, ExpandedKey, t
|
||
#else
|
||
// For decryption, prepare to iterate backward through expanded key.
// Start at the round key t bytes in and finish at the first one.
|
||
mov ExpandedKeyEnd, ExpandedKey
|
||
add ExpandedKey, t
|
||
#endif
|
||
|
||
/*
	thumb2_ldmia base, w0, w1, w2, w3

	Loads four consecutive 32-bit words from [base+0], [base+4], [base+8] and [base+12]
	into w0..w3 with individual ldr instructions, in ascending address order and
	without base writeback.

	This is used instead of ldmia $0, {$1-$4} because ldmia hits a memory access
	error when $0 is not word-aligned in thumb state.
*/
|
||
.macro thumb2_ldmia
|
||
ldr $1, [$0, #0]
|
||
ldr $2, [$0, #4]
|
||
ldr $3, [$0, #8]
|
||
ldr $4, [$0, #12]
|
||
.endm
|
||
|
||
// thumb2_stmia base, w0, w1, w2, w3
// Stores w0..w3 to [base+0], [base+4], [base+8] and [base+12] with individual str
// instructions (ascending address order, no base writeback); the store counterpart of thumb2_ldmia.
.macro thumb2_stmia
|
||
str $1, [$0, #0]
|
||
str $2, [$0, #4]
|
||
str $3, [$0, #8]
|
||
str $4, [$0, #12]
|
||
.endm
|
||
|
||
// Initialize State from input text.
|
||
// Individual word loads are used because ldmia would crash when the input (pointed to by r4) is not word aligned.
|
||
thumb2_ldmia r4, S0, S1, S2, S3
|
||
|
||
// Add round key and save results.
// r4 is free again here (the input has been read), so r4, r5, r8, r10 hold the first round key.
|
||
thumb2_ldmia ExpandedKey, r4, r5, r8, r10
|
||
add ExpandedKey, #Increment
|
||
|
||
eor S0, r4
|
||
eor S1, r5
|
||
eor S2, r8
|
||
eor S3, r10
|
||
|
||
// Set up r6 = _AESEncryptTable or _AESDecryptTable
// L_table1 holds the offset from (L_table0+4) to the non-lazy pointer slot for MTable.
// Reading pc in Thumb state yields the address of the current instruction plus 4, i.e. L_table0+4,
// so r12 + r6 is the address of the slot and the final ldr fetches the table address from it.
|
||
ldr r6, L_table1
|
||
L_table0:
|
||
mov r12, pc
|
||
ldr r6, [r12, r6]
|
||
|
||
// save S0-S3 in the stack memory (sp[0:15]), where the round macros read the state from
|
||
stmia sp, {S0-S3}
|
||
|
||
// r9 = 0xff byte mask, used to extract a byte from a shifted word (uxtb was tried: same cost, but it restricts the code to armv6 or above)
|
||
mov r9, #0xff
|
||
|
||
// Get round key.
// S0-S3 now hold the next round key; the round macro xors the table lookups into them.
|
||
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
|
||
add ExpandedKey, #Increment
|
||
|
||
// per round operation
|
||
|
||
/*
	aes_per_round : one main AES round.

	On entry
		sp[0:15] : the state computed by the previous round
		r0-r3    : (S0-S3) the next expanded round key
		r6       : points to the Main table (indexed as 32-bit words)
		r9       : 0xff, used as a byte mask (only by the __ARM_ARCH_7S__ variant)

	The macro reads the 16 state bytes in sp[0:15]. Each byte indexes the table and the
	resulting 4-byte word is xor-ed into one of r0-r3: byte 0 of a word is used as is,
	bytes 1, 2 and 3 are rotated right by 24, 16 and 8 bits. The P0-P3 aliases choose the
	destination words for bytes 1 and 3 according to Select.

	On exit r0-r3 hold the new aes state. r4, r5, r8, r10 are clobbered, and the
	__ARM_ARCH_7S__ variant also clobbers t (r12).

	Two instruction schedules are provided; they differ only in how the state bytes are
	fetched (word load plus masking, or four byte loads).
*/
|
||
|
||
.macro aes_per_round
|
||
|
||
#if defined (__ARM_ARCH_7S__)
|
||
// better for swift and (old cortex-a8)
|
||
|
||
// S0 process
|
||
ldr t, [sp, #0] // load 4 bytes for S0 process
|
||
and r4, r9, t // byte 0
|
||
and r5, r9, t, lsr #8 // byte 1
|
||
ldr r4, [r6, r4, lsl #2] // 1st table lookup
|
||
and r8, r9, t, lsr #16 // byte 2
|
||
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
|
||
and r10, r9, t, lsr #24 // byte 3
|
||
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
|
||
eor S0, r4 // S0 ^= 1st table lookup
|
||
ldr r10, [r6, r10, lsl #2] // 4th table lookup
|
||
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
|
||
ldr t, [sp, #4] // read Word for next S1 process
|
||
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
|
||
eor P1, r10, ror #8 // P1 ^= 4th table lookup
|
||
|
||
// S1 process (same pattern as S0, destinations advanced by one word)
|
||
and r4, r9, t
|
||
and r5, r9, t, lsr #8
|
||
ldr r4, [r6, r4, lsl #2]
|
||
and r8, r9, t, lsr #16
|
||
ldr r5, [r6, r5, lsl #2]
|
||
and r10, r9, t, lsr #24
|
||
ldr r8, [r6, r8, lsl #2]
|
||
eor S1, r4
|
||
ldr r10, [r6, r10, lsl #2]
|
||
eor P0, r5, ror #24
|
||
ldr t, [sp, #8] // read word for next S2 process
|
||
eor S3, r8, ror #16
|
||
eor P2, r10, ror #8
|
||
|
||
// S2 process
|
||
and r4, r9, t
|
||
and r5, r9, t, lsr #8
|
||
ldr r4, [r6, r4, lsl #2]
|
||
and r8, r9, t, lsr #16
|
||
ldr r5, [r6, r5, lsl #2]
|
||
and r10, r9, t, lsr #24
|
||
ldr r8, [r6, r8, lsl #2]
|
||
eor S2, r4
|
||
ldr r10, [r6, r10, lsl #2]
|
||
eor P1, r5, ror #24
|
||
ldr t, [sp, #12] // read word for next S3 process
|
||
eor S0, r8, ror #16
|
||
eor P3, r10, ror #8
|
||
|
||
// S3 process
|
||
and r4, r9, t
|
||
and r5, r9, t, lsr #8
|
||
ldr r4, [r6, r4, lsl #2]
|
||
and r8, r9, t, lsr #16
|
||
ldr r5, [r6, r5, lsl #2]
|
||
and r10, r9, t, lsr #24
|
||
ldr r8, [r6, r8, lsl #2]
|
||
eor S3, r4
|
||
ldr r10, [r6, r10, lsl #2]
|
||
eor P2, r5, ror #24
|
||
eor S1, r8, ror #16
|
||
eor P0, r10, ror #8
|
||
|
||
#else
|
||
|
||
// better for cortex-a7 and cortex-a9
|
||
|
||
// S0 process
|
||
ldrb r4, [sp, #0] // byte 0
|
||
ldrb r5, [sp, #1] // byte 1
|
||
ldrb r8, [sp, #2] // byte 2
|
||
ldrb r10, [sp, #3] // byte 3
|
||
ldr r4, [r6, r4, lsl #2] // 1st table lookup
|
||
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
|
||
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
|
||
eor S0, r4 // S0 ^= 1st table lookup
|
||
ldr r10, [r6, r10, lsl #2] // 4th table lookup
|
||
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
|
||
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
|
||
eor P1, r10, ror #8 // P1 ^= 4th table lookup
|
||
|
||
// S1 process (same pattern as S0, destinations advanced by one word)
|
||
ldrb r4, [sp, #4] // byte 0
|
||
ldrb r5, [sp, #5] // byte 1
|
||
ldrb r8, [sp, #6] // byte 2
|
||
ldrb r10, [sp, #7] // byte 3
|
||
ldr r4, [r6, r4, lsl #2]
|
||
ldr r5, [r6, r5, lsl #2]
|
||
ldr r8, [r6, r8, lsl #2]
|
||
eor S1, r4
|
||
ldr r10, [r6, r10, lsl #2]
|
||
eor P0, r5, ror #24
|
||
eor S3, r8, ror #16
|
||
eor P2, r10, ror #8
|
||
|
||
// S2 process
|
||
ldrb r4, [sp, #8] // byte 0
|
||
ldrb r5, [sp, #9] // byte 1
|
||
ldrb r8, [sp, #10] // byte 2
|
||
ldrb r10, [sp, #11] // byte 3
|
||
ldr r4, [r6, r4, lsl #2]
|
||
ldr r5, [r6, r5, lsl #2]
|
||
ldr r8, [r6, r8, lsl #2]
|
||
eor S2, r4
|
||
ldr r10, [r6, r10, lsl #2]
|
||
eor P1, r5, ror #24
|
||
eor S0, r8, ror #16
|
||
eor P3, r10, ror #8
|
||
|
||
// S3 process
|
||
ldrb r4, [sp, #12] // byte 0
|
||
ldrb r5, [sp, #13] // byte 1
|
||
ldrb r8, [sp, #14] // byte 2
|
||
ldrb r10, [sp, #15] // byte 3
|
||
ldr r4, [r6, r4, lsl #2]
|
||
ldr r5, [r6, r5, lsl #2]
|
||
ldr r8, [r6, r8, lsl #2]
|
||
eor S3, r4
|
||
ldr r10, [r6, r10, lsl #2]
|
||
eor P2, r5, ror #24
|
||
eor S1, r8, ror #16
|
||
eor P0, r10, ror #8
|
||
|
||
#endif
|
||
|
||
.endm
|
||
|
||
// aes_last_round : the final AES round.
// Same register contract and data flow as aes_per_round (state in sp[0:15], round key in r0-r3,
// r6 = table, r9 = 0xff), except that every table lookup is a byte load (ldrb) with an unscaled
// index. Each lookup therefore yields one zero-extended byte, which the rotations by 24, 16 and 8
// place in bits 8-15, 16-23 and 24-31 of the destination word.
// NOTE(review): the unscaled byte indexing implies the Final table is read as 256 single bytes,
// despite the "WordTable" in its symbol name - confirm against the table definition.
// Clobbers r4, r5, r8, r10, and t (r12) in the __ARM_ARCH_7S__ variant.
.macro aes_last_round
|
||
#if defined (__ARM_ARCH_7S__)
|
||
// better for swift (and old cortex-a8)
|
||
|
||
// S0 process
|
||
ldr t, [sp, #0] // load 4 bytes for S0 process
|
||
and r4, r9, t // byte 0
|
||
and r5, r9, t, lsr #8 // byte 1
|
||
ldrb r4, [r6, r4] // 1st table lookup
|
||
and r8, r9, t, lsr #16 // byte 2
|
||
ldrb r5, [r6, r5] // 2nd table lookup
|
||
and r10, r9, t, lsr #24 // byte 3
|
||
ldrb r8, [r6, r8] // 3rd table lookup
|
||
eor S0, r4 // S0 ^= 1st table lookup
|
||
ldrb r10, [r6, r10] // 4th table lookup
|
||
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
|
||
ldr t, [sp, #4] // read Word for next S1 process
|
||
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
|
||
eor P1, r10, ror #8 // P1 ^= 4th table lookup
|
||
|
||
// S1 process (same pattern as S0, destinations advanced by one word)
|
||
and r4, r9, t
|
||
and r5, r9, t, lsr #8
|
||
ldrb r4, [r6, r4]
|
||
and r8, r9, t, lsr #16
|
||
ldrb r5, [r6, r5]
|
||
and r10, r9, t, lsr #24
|
||
ldrb r8, [r6, r8]
|
||
eor S1, r4
|
||
ldrb r10, [r6, r10]
|
||
eor P0, r5, ror #24
|
||
ldr t, [sp, #8] // read word for next S2 process
|
||
eor S3, r8, ror #16
|
||
eor P2, r10, ror #8
|
||
|
||
// S2 process
|
||
and r4, r9, t
|
||
and r5, r9, t, lsr #8
|
||
ldrb r4, [r6, r4]
|
||
and r8, r9, t, lsr #16
|
||
ldrb r5, [r6, r5]
|
||
and r10, r9, t, lsr #24
|
||
ldrb r8, [r6, r8]
|
||
eor S2, r4
|
||
ldrb r10, [r6, r10]
|
||
eor P1, r5, ror #24
|
||
ldr t, [sp, #12] // read word for next S3 process
|
||
eor S0, r8, ror #16
|
||
eor P3, r10, ror #8
|
||
|
||
// S3 process
|
||
and r4, r9, t
|
||
and r5, r9, t, lsr #8
|
||
ldrb r4, [r6, r4]
|
||
and r8, r9, t, lsr #16
|
||
ldrb r5, [r6, r5]
|
||
and r10, r9, t, lsr #24
|
||
ldrb r8, [r6, r8]
|
||
eor S3, r4
|
||
ldrb r10, [r6, r10]
|
||
eor P2, r5, ror #24
|
||
eor S1, r8, ror #16
|
||
eor P0, r10, ror #8
|
||
|
||
#else
|
||
// better for cortex-a7 and cortex-a9
|
||
|
||
// S0 process
|
||
ldrb r4, [sp, #0] // byte 0
|
||
ldrb r5, [sp, #1] // byte 1
|
||
ldrb r8, [sp, #2] // byte 2
|
||
ldrb r10, [sp, #3] // byte 3
|
||
ldrb r4, [r6, r4] // 1st table lookup
|
||
ldrb r5, [r6, r5] // 2nd table lookup
|
||
ldrb r8, [r6, r8] // 3rd table lookup
|
||
eor S0, r4 // S0 ^= 1st table lookup
|
||
ldrb r10, [r6, r10] // 4th table lookup
|
||
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
|
||
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
|
||
eor P1, r10, ror #8 // P1 ^= 4th table lookup
|
||
|
||
// S1 process (same pattern as S0, destinations advanced by one word)
|
||
ldrb r4, [sp, #4] // byte 0
|
||
ldrb r5, [sp, #5] // byte 1
|
||
ldrb r8, [sp, #6] // byte 2
|
||
ldrb r10, [sp, #7] // byte 3
|
||
ldrb r4, [r6, r4]
|
||
ldrb r5, [r6, r5]
|
||
ldrb r8, [r6, r8]
|
||
eor S1, r4
|
||
ldrb r10, [r6, r10]
|
||
eor P0, r5, ror #24
|
||
eor S3, r8, ror #16
|
||
eor P2, r10, ror #8
|
||
|
||
// S2 process
|
||
ldrb r4, [sp, #8] // byte 0
|
||
ldrb r5, [sp, #9] // byte 1
|
||
ldrb r8, [sp, #10] // byte 2
|
||
ldrb r10, [sp, #11] // byte 3
|
||
ldrb r4, [r6, r4]
|
||
ldrb r5, [r6, r5]
|
||
ldrb r8, [r6, r8]
|
||
eor S2, r4
|
||
ldrb r10, [r6, r10]
|
||
eor P1, r5, ror #24
|
||
eor S0, r8, ror #16
|
||
eor P3, r10, ror #8
|
||
|
||
// S3 process
|
||
ldrb r4, [sp, #12] // byte 0
|
||
ldrb r5, [sp, #13] // byte 1
|
||
ldrb r8, [sp, #14] // byte 2
|
||
ldrb r10, [sp, #15] // byte 3
|
||
ldrb r4, [r6, r4]
|
||
ldrb r5, [r6, r5]
|
||
ldrb r8, [r6, r8]
|
||
eor S3, r4
|
||
ldrb r10, [r6, r10]
|
||
eor P2, r5, ror #24
|
||
eor S1, r8, ror #16
|
||
eor P0, r10, ror #8
|
||
#endif
|
||
|
||
.endm
|
||
|
||
// Main round loop: each iteration runs one aes_per_round on the state in sp[0:15]
// with the round key held in S0-S3.
1:
|
||
aes_per_round
|
||
|
||
// Save state for next iteration and load next round key.
|
||
stmia sp,{S0-S3}
|
||
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
|
||
|
||
// Compare before stepping: the loop ends once the key just loaded came from ExpandedKeyEnd,
// which leaves that key in S0-S3 for the last round. The add has no s suffix, so it does not
// change the flags that bne tests.
cmp ExpandedKeyEnd, ExpandedKey
|
||
add ExpandedKey, #Increment
|
||
bne 1b
|
||
|
||
// setup r6 = _AESSubBytesWordTable or _AESInvSubBytesWordTable
// Same pc-relative sequence as for the main table: L_table3 holds the offset from (L_table2+4)
// to the non-lazy pointer slot for FTable, and pc reads as L_table2+4 in Thumb state.
|
||
ldr r6, L_table3
|
||
L_table2:
|
||
mov r12, pc
|
||
ldr r6, [r12, r6]
|
||
|
||
aes_last_round
|
||
|
||
ldr r4, [sp, #(16+8)] // restore OutputText: the r1 value pushed in the prologue sits just above the 24 bytes of scratch
|
||
thumb2_stmia r4, S0, S1, S2, S3
|
||
eor r0, r0 // Return success.
|
||
|
||
// Common exit, also reached from the key length error path with r0 = -1.
9:
|
||
|
||
add sp, #(4+16+8) // drop the scratch area and the saved r1 slot (r1 is not restored)
|
||
pop {r4-r6,r8-r11}
|
||
pop {r7, pc}
|
||
|
||
|
||
// Literal words read by the table setup code. Each holds the distance from the value
// that pc yields at the matching "mov r12, pc" (label address + 4 in Thumb state) to the
// non-lazy pointer slot defined further down.
.p2align 2
|
||
L_table1:
|
||
.long L_Tab$non_lazy_ptr-(L_table0+4)
|
||
|
||
.p2align 2
|
||
L_table3:
|
||
.long L_Tab$non_lazy_ptr2-(L_table2+4)
|
||
|
||
// Non-lazy symbol pointer slots for the Main and Final tables. Each slot is emitted as zero
// and tied to its symbol with .indirect_symbol; the code loads the table address from the slot.
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
|
||
.p2align 2
|
||
L_Tab$non_lazy_ptr:
|
||
.indirect_symbol MTable
|
||
.long 0
|
||
|
||
.p2align 2
|
||
L_Tab$non_lazy_ptr2:
|
||
.indirect_symbol FTable
|
||
.long 0
|
||
|
||
#endif // __ARM_NEON__
|
||
|
||
// Release the state, routine, table, permutation and increment macros defined at the top of this file.
// NOTE(review): ExpandedKey, ExpandedKeyEnd, ContextKeyLength and t are also defined in this file
// but are not undefined here - confirm whether that is intentional.
#undef S0
|
||
#undef S1
|
||
#undef S2
|
||
#undef S3
|
||
#undef Name
|
||
#undef MTable
|
||
#undef FTable
|
||
#undef P0
|
||
#undef P1
|
||
#undef P2
|
||
#undef P3
|
||
#undef Increment
|
||
|
||
#endif /* defined(__arm__) */
|
||
|