corecrypto/acceleratecrypto/Source/aes/intel/EncryptDecrypt.s

577 lines
14 KiB
ArmAsm
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__i386__) || defined(__x86_64__)
/* This file defines _vng_aes_encrypt or _vng_aes_decrypt, according to the value of
the Select preprocessor symbol. This file is designed to be included in
another assembly file using the preprocessor #include directive, to benefit
from some assembly-time calculations.
These two routines are nearly identical. They differ only in the tables
they use, the direction they iterate through the key, and the permutation
performed on part of the state.
Written by Eric Postpischil, January 2008.
*/
#if Select == 0
#define Name _aes_encrypt_nonaesni // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _aes_decrypt_nonaesni // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select
/* Routine:
_AESEncryptWithExpandedKey (if Select is 0) or
_AESDecryptWithExpandedKey (if Select is 1).
Function:
Perform the AES cipher or its inverse as defined in Federal Information
Processing Standards Publication 197 (FIPS-197), November 26, 2001.
The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.
Input:
Constant data:
The following names must be locally defined so the assembler
can calculate certain offsets.
For encryption:
static const Word _AESEncryptTable[4][256].
_AESEncryptTable[i] contains the tables T[i] defined in AES
Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
Vincent Rijmen, section 5.2.1, page 18. These tables
combine the SubBytes and MixColumns operations.
static const Word _AESSubBytesWordTable[256].
_AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
SubBytes is defined in FIPS-197. _AESSubBytesWordTable
differs from _AESEncryptTable in that it does not include
the MixColumn operation. It is used in performing the last
round, which differs fromm the previous rounds in that it
does not include the MixColumn operation.
For decryption:
static const Word _AESDecryptTable[4][256].
The analog of _AESEncryptTable for decryption.
static const Word _AESSubBytesWordTable[256].
_AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
where InvSubBytes is defined in FIPS-197.
_AESInvSubBytesWordTable differs from _AESDecryptTable in
that it does not include the InvMixColumn operation. It is
used in performing the last round, which differs from the
previous rounds in that it does not include the
InvMixColumn operation.
Arguments:
const Byte *InputText.
Address of input, 16 bytes. Best if four-byte aligned.
Byte *OutputText.
Address of output, 16 bytes. Best if four-byte aligned.
vng_aes_encrypt_ctx *Context or vng_aes_decrypt_ctx *Context
vng_aes_encrypt_ctx and vng_aes_decrypt_ctx are identical except the
former is used for encryption and the latter for decryption.
Each is a structure containing the expanded key beginning at
offset ContextKey and a four-byte "key length" beginning at
offset ContextKeyLength. The "key length" is the number of
bytes from the start of the first round key to the start of the
last round key. That is 16 less than the number of bytes in
the entire key.
Output:
Encrypted or decrypted data is written to *OutputText.
Return:
aes_rval // -1 if "key length" is invalid. 0 otherwise.
*/
.text
.globl Name
Name:
// Jimmur removed the capabilities check and the just to the AESNI code. This
// will be handled by the C code.
// Push new stack frame.
push r5
/* Save registers and set SaveSize to the number of bytes pushed onto the
stack so far, including the caller's return address.
*/
push r3
#if defined __i386__
push r6
push r7
#define SaveSize (5*4)
#else
#define SaveSize (3*8)
#endif
/* Number of bytes used for local variables:
4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd.
5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers.
*/
#define LocalsSize (Arch(4, 0) + Arch(5, 3)*16)
#if 0 < LocalsSize
// Padding to position stack pointer at a multiple of 16 bytes.
#define Padding (15 & -(SaveSize + LocalsSize))
sub $Padding + LocalsSize, r4 // Allocate space on stack.
#else
#define Padding 0
#endif
#if BUILDKERNEL
// Save XMM registers.
movaps %xmm0, 0*16(r4)
movaps %xmm1, 1*16(r4)
movaps %xmm2, 2*16(r4)
#if defined __i386__
movaps %xmm3, 3*16(r4)
movaps %xmm4, 4*16(r4)
#endif
#endif // BUILDKERNEL
#if defined __i386__
// Number of bytes from caller's stack pointer to ours.
#define StackFrame (SaveSize + Padding + LocalsSize)
// Define location of argument i (presuming 4-byte arguments).
#define Argument(i) StackFrame+4*(i)(%esp)
#define ArgInputText Argument(0)
#define ArgOutputText Argument(1)
#define ArgContext Argument(2)
#elif defined __x86_64__
// Arguments.
#define InputText r7 // Used early then overwritten for other use.
#define OutputText r6 // Needed near end of routine.
#define ArgContext r2
/* The argument passed in r2 overlaps registers we need for other
work, so it must be moved early in the routine.
*/
#endif
#define BaseP Arch(r6, r9) // Base pointer for addressing global data.
#define ExpandedKey Arch(t0, r10) // Address of expanded key.
/* The Work registers defined below are used to hold parts of the AES state
while we dissect or assemble it. They must be assigned to the A, B, C, and
D registers so that we can access the bytes in %al, %ah, and so on.
*/
#define Work0d r0d
#define Work0l r0l
#define Work0h r0h
#define Work1d r3d
#define Work1l r3l
#define Work1h r3h
#define Work2d r1d
#define Work2l r1l
#define Work2h r1h
#define Work3d r2d
#define Work3l r2l
#define Work3h r2h
#define t0 r5
#define t0d r5d // Low 32 bits of t0.
#define t0l r5l // Low byte of t0.
#define t1 r7
/* S0, S1, S2, and S3 are where we assemble the new AES state when computing
a regular round. S1, S2, and S3 are assigned to the Work registers, but
S0 needs to go somewhere else because Work0 holds part of the old state.
*/
#define S0 Arch(t1, r8d)
#define S1 Work1d
#define S2 Work2d
#define S3 Work3d
/* These XMM registers are used as holding space, because it is faster to
spill to these registers than to the stack. (On x86_64, we do not need
to spill, because there are additional general registers available.
However, using more general registers requires saving them to the stack
and restoring them. I timed it, and no time was saved.)
*/
#define vS1 %xmm0
#define vS2 %xmm1
#define vS3 %xmm2
#if defined __i386__
#define vExpandedKey %xmm3
#define vIncrement %xmm4
#endif
// Get address of expanded key.
mov ArgContext, ExpandedKey
#if 0 != ContextKey
add $ContextKey, ExpandedKey
#endif
/* Store sentinel value of ExpandedKey on the stack on i386, a register on
x86_64.
*/
#define ExpandedKeyEnd Arch(5*16(r4), r11)
// Get and check "key length".
movzb ContextKeyLength(ExpandedKey), r0
cmp $160, r0
je 2f
cmp $192, r0
je 2f
cmp $224, r0
je 2f
mov $-1, r0 // Return error.
jmp 9f
2:
#if (Select == 0 || Select == 2)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKey, r0
mov r0, ExpandedKeyEnd
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKey, ExpandedKeyEnd
add r0, ExpandedKey
#endif
// Initialize State from input text.
#if defined __i386__
mov ArgInputText, BaseP
#define InputText BaseP
#endif
mov 0*4(InputText), Work0d
mov 1*4(InputText), S1
mov 2*4(InputText), S2
mov 3*4(InputText), S3
#undef InputText // Register is reused after this for other purposes.
// Add round key and save results.
xor 0*4(ExpandedKey), Work0d // S0 is in dissection register.
xor 1*4(ExpandedKey), S1
movd S1, vS1 // Save S1 to S3 in vector registers.
xor 2*4(ExpandedKey), S2
movd S2, vS2
xor 3*4(ExpandedKey), S3
movd S3, vS3
add $Increment, ExpandedKey // Advance to next round key.
#if defined __i386__
// Save expanded key address and increment in vector registers.
mov $Increment, t1
movp ExpandedKey, vExpandedKey
movp t1, vIncrement
#endif
// Set up relative addressing.
#if defined __i386__
// Get address of 0 in BaseP.
call 0f // Push program counter onto stack.
0:
pop BaseP // Get program counter.
// Define macros to help address data.
#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4)
#elif defined __x86_64__
lea MTable(%rip), BaseP
// Define macros to help address data.
#define LookupM(table, index) (table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) (table)*TableSize(BaseP, index, 4)
/* With these definitions of LookupM and LookupF, BaseP must be loaded with
the address of the table at the point where it is used. So we need an
instruction to change BaseP after we are done with MTable and before we
start using FTable. I would prefer to use something like:
.set FMinusM, FTable - MTable
#define LookupF(table, index) \
FMinusM+(table)*TableSize(BaseP, index, 4)
Then BaseP would not need to change. However, this fails due to an
assembler/linker bug.
*/
#endif
// Get round key.
mov 0*4(ExpandedKey), S0
mov 1*4(ExpandedKey), S1
mov 2*4(ExpandedKey), S2
mov 3*4(ExpandedKey), S3
1:
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/
// Process previous S0.
movzb Work0l, t0
xor LookupM(0, t0), S0
movzb Work0h, t0d
xor LookupM(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S2
movzb Work0h, t0d
xor LookupM(3, t0), P1
// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S1
movzb Work0h, t0d
xor LookupM(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S3
movzb Work0h, t0d
xor LookupM(3, t0), P2
// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S2
movzb Work0h, t0d
xor LookupM(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S0
movzb Work0h, t0d
xor LookupM(3, t0), P3
// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S3
movzb Work0h, t0d
xor LookupM(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S1
movzb Work0h, t0d
xor LookupM(3, t0), P0
#if defined __i386__
paddd vIncrement, vExpandedKey
movp vExpandedKey, ExpandedKey
#else
add $Increment, ExpandedKey
#endif
// Save state for next iteration and load next round key.
mov S0, Work0d
mov 0*4(ExpandedKey), S0
movd S1, vS1
mov 1*4(ExpandedKey), S1
movd S2, vS2
mov 2*4(ExpandedKey), S2
movd S3, vS3
mov 3*4(ExpandedKey), S3
cmp ExpandedKeyEnd, ExpandedKey
jne 1b
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/
// Work around assembler bug. See comments above about Radar 5683882.
#if defined __x86_64__
lea FTable(%rip), BaseP
#endif
// Process previous S0.
movzb Work0l, t0
xor LookupF(0, t0), S0
movzb Work0h, t0d
xor LookupF(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S2
movzb Work0h, t0d
xor LookupF(3, t0), P1
// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S1
movzb Work0h, t0d
xor LookupF(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S3
movzb Work0h, t0d
xor LookupF(3, t0), P2
// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S2
movzb Work0h, t0d
xor LookupF(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S0
movzb Work0h, t0d
xor LookupF(3, t0), P3
// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S3
movzb Work0h, t0d
xor LookupF(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S1
movzb Work0h, t0d
xor LookupF(3, t0), P0
#if defined __i386__ // Architecture.
// Get OutputText address.
#define OutputText BaseP
mov ArgOutputText, OutputText
#endif // Architecture.
// Write output.
mov S0, 0*4(OutputText)
mov S1, 1*4(OutputText)
mov S2, 2*4(OutputText)
mov S3, 3*4(OutputText)
xor r0, r0 // Return success.
9:
// Pop stack and restore registers.
#if BUILDKERNEL
#if defined __i386__
movaps 4*16(r4), %xmm4
movaps 3*16(r4), %xmm3
#endif
movaps 2*16(r4), %xmm2
movaps 1*16(r4), %xmm1
movaps 0*16(r4), %xmm0
#endif // BUILDKERNEL
#if 0 < LocalsSize
add $Padding + LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
#endif
pop r3
pop r5
ret
#undef ArgExpandedKey
#undef ArgInputText
#undef ArgNr
#undef ArgOutputText
#undef Argument
#undef BaseP
#undef ExpandedKey
#undef ExpandedKeyEnd
#undef FTable
#undef InputText
#undef LocalsSize
#undef LookupM
#undef LookupF
#undef MTable
#undef OutputText
#undef Padding
#undef SaveSize
#undef S0
#undef S1
#undef S2
#undef S3
#undef StackFrame
#undef Work0d
#undef Work0h
#undef Work0l
#undef Work1d
#undef Work1h
#undef Work1l
#undef Work2d
#undef Work2h
#undef Work2l
#undef Work3d
#undef Work3h
#undef Work3l
#undef t0
#undef t0d
#undef t0l
#undef t1
#undef vExpandedKey
#undef vS1
#undef vS2
#undef vS3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment
#endif // defined(__x86_64__) || defined(__i386__)