# Copyright (c) (2011,2012,2013,2014,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#if defined(__arm__)

#define S0  r0
#define S1  r1
#define S2  r2
#define S3  r3

#if Select == 0
    #define Name        _AccelerateCrypto_AES_encrypt  // Routine name.
    #define MTable      _AESEncryptTable               // Main table.
    #define FTable      _AESSubBytesWordTable          // Final table.
    #define P0          S0                             // State permutation.
    #define P1          S1
    #define P2          S2
    #define P3          S3
    #define Increment   +16                            // ExpandedKey increment.
#elif Select == 1
    #define Name        _AccelerateCrypto_AES_decrypt  // Routine name.
    #define MTable      _AESDecryptTable               // Main table.
    #define FTable      _AESInvSubBytesWordTable       // Final table.
    #define P0          S2                             // State permutation.
    #define P1          S3
    #define P2          S0
    #define P3          S1
    #define Increment   -16                            // ExpandedKey increment.
#endif  // Select

#if defined(__ARM_NEON__)
    // vpaes uses NEON instructions.
    .extern _AccelerateCrypto_vpaes_encrypt
    .extern _AccelerateCrypto_vpaes_decrypt
#endif

#define ExpandedKey         r11
#define ExpandedKeyEnd      lr
#define ContextKeyLength    240
#define t                   r12

    .subsections_via_symbols
    .text
    .syntax unified
    .p2align    2
    .code       16
    .thumb_func Name

    .globl Name
Name:

#if defined(__ARM_NEON__)

    // If NEON is available, use the cache-attack-resilient vector-permute AES.
#if Select == 0
    b       _AccelerateCrypto_vpaes_encrypt
#else
    b       _AccelerateCrypto_vpaes_decrypt
#endif

#else   // __ARM_NEON__

    // Set up the debug trace frame pointer.
    push    {r7,lr}
    mov     r7, sp

    // Now set up the stack for the current function.
    push    {r1,r4-r6,r8-r11}
    sub     sp, #(16+8)         // make sp 16-byte aligned

    // Copy r0,r2 to r4,r11 to release r0,r2 (r1 is saved on the stack) for use as S0-S3.
    mov     r4, r0
    mov     ExpandedKey, r2

    // Get and check "key length".
    ldr     t, [ExpandedKey, #ContextKeyLength]
    cmp     t, #160
    beq     2f
    cmp     t, #192
    beq     2f
    cmp     t, #224
    beq     2f
    mov     r0, #-1             // Return error.
    b       9f
2:

#if (Select == 0)
    // For encryption, prepare to iterate forward through the expanded key.
    add     ExpandedKeyEnd, ExpandedKey, t
#else
    // For decryption, prepare to iterate backward through the expanded key.
    mov     ExpandedKeyEnd, ExpandedKey
    add     ExpandedKey, t
#endif

    /*
        These macros are needed because ldmia/stmia $0, {$1-$4} would hit a
        memory-access error when $0 is not word-aligned in Thumb state.
    */
    .macro  thumb2_ldmia
    ldr     $1, [$0, #0]
    ldr     $2, [$0, #4]
    ldr     $3, [$0, #8]
    ldr     $4, [$0, #12]
    .endm

    .macro  thumb2_stmia
    str     $1, [$0, #0]
    str     $2, [$0, #4]
    str     $3, [$0, #8]
    str     $4, [$0, #12]
    .endm

    // Initialize State from the input text.
    // thumb2_ldmia is used because ldmia would crash when the input (pointed to by r4) is not word-aligned.
    thumb2_ldmia    r4, S0, S1, S2, S3
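    /*
        Overview of the flow below (a sketch; MTable_round, FTable_round and
        rk[] are illustrative names only):

            state ^= rk[0];                             // initial AddRoundKey
            for (r = 1; r < Nr; ++r)
                state = MTable_round(state) ^ rk[r];    // aes_per_round
            state = FTable_round(state) ^ rk[Nr];       // aes_last_round

        Each full round is computed with per-byte lookups into MTable; the
        P0-P3 word permutation realizes (Inv)ShiftRows, and the final round
        uses FTable, which applies only (Inv)SubBytes.  The 160/192/224
        values checked above appear to be 16*Nr for Nr = 10/12/14, i.e.
        AES-128/192/256.
    */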
    // Add round key and save results.
    thumb2_ldmia    ExpandedKey, r4, r5, r8, r10
    add     ExpandedKey, #Increment
    eor     S0, r4
    eor     S1, r5
    eor     S2, r8
    eor     S3, r10

    // Set up r6 = _AESEncryptTable or _AESDecryptTable.
    ldr     r6, L_table1
L_table0:
    mov     r12, pc
    ldr     r6, [r12, r6]

    // Save S0-S3 in the stack memory.
    stmia   sp, {S0-S3}

    // r9 = 0xff is used to extract a byte from a shifted word; uxtb was tried
    // and has the same complexity, but would limit the code to ARMv6 or above.
    mov     r9, #0xff

    // Get round key.
    thumb2_ldmia    ExpandedKey, S0, S1, S2, S3
    add     ExpandedKey, #Increment

    // Per-round operation.
    /*
        The following macro defines the per-round operation for AES.

        The state computed by the previous round has been saved in sp[0:15],
        and r0-r3 have been initialized with the next expanded round key.
        The macro reads the 16 bytes in sp[0:15] and, for each byte, does a
        table lookup; the resulting 4-byte word is XORed into one of r0-r3.
        The final r0-r3 is the new AES state.

        r6 : points to the Main or Final table
        r9 : 0xff, used as a byte mask
    */
    .macro aes_per_round

#if defined (__ARM_ARCH_7S__)   // better for swift and (old cortex-a8)

    // S0 process
    ldr     t, [sp, #0]             // load 4 bytes for S0 process
    and     r4, r9, t               // byte 0
    and     r5, r9, t, lsr #8       // byte 1
    ldr     r4, [r6, r4, lsl #2]    // 1st table lookup
    and     r8, r9, t, lsr #16      // byte 2
    ldr     r5, [r6, r5, lsl #2]    // 2nd table lookup
    and     r10, r9, t, lsr #24     // byte 3
    ldr     r8, [r6, r8, lsl #2]    // 3rd table lookup
    eor     S0, r4                  // S0 ^= 1st table lookup
    ldr     r10, [r6, r10, lsl #2]  // 4th table lookup
    eor     P3, r5, ror #24         // P3 ^= 2nd table lookup
    ldr     t, [sp, #4]             // read word for next S1 process
    eor     S2, r8, ror #16         // S2 ^= 3rd table lookup
    eor     P1, r10, ror #8         // P1 ^= 4th table lookup

    // S1 process
    and     r4, r9, t
    and     r5, r9, t, lsr #8
    ldr     r4, [r6, r4, lsl #2]
    and     r8, r9, t, lsr #16
    ldr     r5, [r6, r5, lsl #2]
    and     r10, r9, t, lsr #24
    ldr     r8, [r6, r8, lsl #2]
    eor     S1, r4
    ldr     r10, [r6, r10, lsl #2]
    eor     P0, r5, ror #24
    ldr     t, [sp, #8]
    eor     S3, r8, ror #16
    eor     P2, r10, ror #8

    // S2 process
    and     r4, r9, t
    and     r5, r9, t, lsr #8
    ldr     r4, [r6, r4, lsl #2]
    and     r8, r9, t, lsr #16
    ldr     r5, [r6, r5, lsl #2]
    and     r10, r9, t, lsr #24
    ldr     r8, [r6, r8, lsl #2]
    eor     S2, r4
    ldr     r10, [r6, r10, lsl #2]
    eor     P1, r5, ror #24
    ldr     t, [sp, #12]
    eor     S0, r8, ror #16
    eor     P3, r10, ror #8

    // S3 process
    and     r4, r9, t
    and     r5, r9, t, lsr #8
    ldr     r4, [r6, r4, lsl #2]
    and     r8, r9, t, lsr #16
    ldr     r5, [r6, r5, lsl #2]
    and     r10, r9, t, lsr #24
    ldr     r8, [r6, r8, lsl #2]
    eor     S3, r4
    ldr     r10, [r6, r10, lsl #2]
    eor     P2, r5, ror #24
    eor     S1, r8, ror #16
    eor     P0, r10, ror #8

#else   // better for cortex-a7 and cortex-a9

    // S0 process
    ldrb    r4, [sp, #0]            // byte 0
    ldrb    r5, [sp, #1]            // byte 1
    ldrb    r8, [sp, #2]            // byte 2
    ldrb    r10, [sp, #3]           // byte 3
    ldr     r4, [r6, r4, lsl #2]    // 1st table lookup
    ldr     r5, [r6, r5, lsl #2]    // 2nd table lookup
    ldr     r8, [r6, r8, lsl #2]    // 3rd table lookup
    eor     S0, r4                  // S0 ^= 1st table lookup
    ldr     r10, [r6, r10, lsl #2]  // 4th table lookup
    eor     P3, r5, ror #24         // P3 ^= 2nd table lookup
    eor     S2, r8, ror #16         // S2 ^= 3rd table lookup
    eor     P1, r10, ror #8         // P1 ^= 4th table lookup

    // S1 process
    ldrb    r4, [sp, #4]            // byte 0
    ldrb    r5, [sp, #5]            // byte 1
    ldrb    r8, [sp, #6]            // byte 2
    ldrb    r10, [sp, #7]           // byte 3
    ldr     r4, [r6, r4, lsl #2]
    ldr     r5, [r6, r5, lsl #2]
    ldr     r8, [r6, r8, lsl #2]
    eor     S1, r4
    ldr     r10, [r6, r10, lsl #2]
    eor     P0, r5, ror #24
    eor     S3, r8, ror #16
    eor     P2, r10, ror #8

    // S2 process
    ldrb    r4, [sp, #8]            // byte 0
    ldrb    r5, [sp, #9]            // byte 1
    ldrb    r8, [sp, #10]           // byte 2
    ldrb    r10, [sp, #11]          // byte 3
    ldr     r4, [r6, r4, lsl #2]
    ldr     r5, [r6, r5, lsl #2]
    ldr     r8, [r6, r8, lsl #2]
    eor     S2, r4
    ldr     r10, [r6, r10, lsl #2]
    eor     P1, r5, ror #24
    eor     S0, r8, ror #16
    eor     P3, r10, ror #8

    // S3 process
    ldrb    r4, [sp, #12]           // byte 0
    ldrb    r5, [sp, #13]           // byte 1
    ldrb    r8, [sp, #14]           // byte 2
    ldrb    r10, [sp, #15]          // byte 3
    ldr     r4, [r6, r4, lsl #2]
    ldr     r5, [r6, r5, lsl #2]
    ldr     r8, [r6, r8, lsl #2]
    eor     S3, r4
    ldr     r10, [r6, r10, lsl #2]
    eor     P2, r5, ror #24
    eor     S1, r8, ror #16
    eor     P0, r10, ror #8

#endif

    .endm

    .macro aes_last_round

#if defined (__ARM_ARCH_7S__)   // better for swift (and old cortex-a8)

    // S0 process
    ldr     t, [sp, #0]             // load 4 bytes for S0 process
    and     r4, r9, t               // byte 0
    and     r5, r9, t, lsr #8       // byte 1
    ldrb    r4, [r6, r4]            // 1st table lookup
    and     r8, r9, t, lsr #16      // byte 2
    ldrb    r5, [r6, r5]            // 2nd table lookup
    and     r10, r9, t, lsr #24     // byte 3
    ldrb    r8, [r6, r8]            // 3rd table lookup
    eor     S0, r4                  // S0 ^= 1st table lookup
    ldrb    r10, [r6, r10]          // 4th table lookup
    eor     P3, r5, ror #24         // P3 ^= 2nd table lookup
    ldr     t, [sp, #4]             // read word for next S1 process
    eor     S2, r8, ror #16         // S2 ^= 3rd table lookup
    eor     P1, r10, ror #8         // P1 ^= 4th table lookup

    // S1 process
    and     r4, r9, t
    and     r5, r9, t, lsr #8
    ldrb    r4, [r6, r4]
    and     r8, r9, t, lsr #16
    ldrb    r5, [r6, r5]
    and     r10, r9, t, lsr #24
    ldrb    r8, [r6, r8]
    eor     S1, r4
    ldrb    r10, [r6, r10]
    eor     P0, r5, ror #24
    ldr     t, [sp, #8]
    eor     S3, r8, ror #16
    eor     P2, r10, ror #8

    // S2 process
    and     r4, r9, t
    and     r5, r9, t, lsr #8
    ldrb    r4, [r6, r4]
    and     r8, r9, t, lsr #16
    ldrb    r5, [r6, r5]
    and     r10, r9, t, lsr #24
    ldrb    r8, [r6, r8]
    eor     S2, r4
    ldrb    r10, [r6, r10]
    eor     P1, r5, ror #24
    ldr     t, [sp, #12]
    eor     S0, r8, ror #16
    eor     P3, r10, ror #8

    // S3 process
    and     r4, r9, t
    and     r5, r9, t, lsr #8
    ldrb    r4, [r6, r4]
    and     r8, r9, t, lsr #16
    ldrb    r5, [r6, r5]
    and     r10, r9, t, lsr #24
    ldrb    r8, [r6, r8]
    eor     S3, r4
    ldrb    r10, [r6, r10]
    eor     P2, r5, ror #24
    eor     S1, r8, ror #16
    eor     P0, r10, ror #8

#else   // better for cortex-a7 and cortex-a9

    // S0 process
    ldrb    r4, [sp, #0]            // byte 0
    ldrb    r5, [sp, #1]            // byte 1
    ldrb    r8, [sp, #2]            // byte 2
    ldrb    r10, [sp, #3]           // byte 3
    ldrb    r4, [r6, r4]            // 1st table lookup
    ldrb    r5, [r6, r5]            // 2nd table lookup
    ldrb    r8, [r6, r8]            // 3rd table lookup
    eor     S0, r4                  // S0 ^= 1st table lookup
    ldrb    r10, [r6, r10]          // 4th table lookup
    eor     P3, r5, ror #24         // P3 ^= 2nd table lookup
    eor     S2, r8, ror #16         // S2 ^= 3rd table lookup
    eor     P1, r10, ror #8         // P1 ^= 4th table lookup

    // S1 process
    ldrb    r4, [sp, #4]            // byte 0
    ldrb    r5, [sp, #5]            // byte 1
    ldrb    r8, [sp, #6]            // byte 2
    ldrb    r10, [sp, #7]           // byte 3
    ldrb    r4, [r6, r4]
    ldrb    r5, [r6, r5]
    ldrb    r8, [r6, r8]
    eor     S1, r4
    ldrb    r10, [r6, r10]
    eor     P0, r5, ror #24
    eor     S3, r8, ror #16
    eor     P2, r10, ror #8

    // S2 process
    ldrb    r4, [sp, #8]            // byte 0
    ldrb    r5, [sp, #9]            // byte 1
    ldrb    r8, [sp, #10]           // byte 2
    ldrb    r10, [sp, #11]          // byte 3
    ldrb    r4, [r6, r4]
    ldrb    r5, [r6, r5]
    ldrb    r8, [r6, r8]
    eor     S2, r4
    ldrb    r10, [r6, r10]
    eor     P1, r5, ror #24
    eor     S0, r8, ror #16
    eor     P3, r10, ror #8

    // S3 process
    ldrb    r4, [sp, #12]           // byte 0
    ldrb    r5, [sp, #13]           // byte 1
    ldrb    r8, [sp, #14]           // byte 2
    ldrb    r10, [sp, #15]          // byte 3
    ldrb    r4, [r6, r4]
    ldrb    r5, [r6, r5]
    ldrb    r8, [r6, r8]
    eor     S3, r4
    ldrb    r10, [r6, r10]
    eor     P2, r5, ror #24
    eor     S1, r8, ror #16
    eor     P0, r10, ror #8

#endif

    .endm

1:
    aes_per_round

    // Save state for next iteration and load next round key.
    stmia           sp, {S0-S3}
    thumb2_ldmia    ExpandedKey, S0, S1, S2, S3
    cmp             ExpandedKeyEnd, ExpandedKey
    add             ExpandedKey, #Increment
    bne             1b

    // setup r6 = _AESSubBytesWordTable or _AESInvSubBytesWordTable
    ldr     r6, L_table3
L_table2:
    mov     r12, pc
    ldr     r6, [r12, r6]

    aes_last_round

    ldr             r4, [sp, #(16+8)]       // restore OutputText
    thumb2_stmia    r4, S0, S1, S2, S3

    eor     r0, r0                          // Return success.
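    // Common exit: label 9 below is also the target of the key-length error
    // path.  The stack adjustment drops the 24-byte scratch area plus the
    // 4-byte slot where r1 (OutputText) was saved, then the callee-saved
    // registers and the frame are restored.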
9:
    add     sp, #(4+16+8)       // skip r1 restore
    pop     {r4-r6,r8-r11}
    pop     {r7, pc}

    .p2align 2
L_table1:
    .long   L_Tab$non_lazy_ptr-(L_table0+4)

    .p2align 2
L_table3:
    .long   L_Tab$non_lazy_ptr2-(L_table2+4)

    .section    __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
    .p2align    2
L_Tab$non_lazy_ptr:
    .indirect_symbol    MTable
    .long   0
    .p2align    2
L_Tab$non_lazy_ptr2:
    .indirect_symbol    FTable
    .long   0

#endif  // __ARM_NEON__

#undef S0
#undef S1
#undef S2
#undef S3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment

#endif  /* defined(__arm__) */