first commit

vel 2021-12-31 13:03:01 -08:00
commit 67e48c5b1d
Signed by: velvox
GPG Key ID: 8C470C59E7724537
1636 changed files with 1375946 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
build

310
CMakeLists.txt Normal file

@@ -0,0 +1,310 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#
# CMake corecrypto build for Linux
#
# This CMake build generates the corecrypto_static library. It is meant to be
# used for Linux only.
#
cmake_minimum_required(VERSION 3.4.3)
set(CMAKE_OSX_SYSROOT "macosx.internal") # NOTE: This must be set before the call to project
project (corecrypto C)
option(CC_LINUX_ASM "Enable assembler support on Linux platform" OFF)
include (CoreCryptoSources.cmake)
#
# Build Macros and Targets
#
# get_include_dirs: extract include directories from list of headers
macro (get_include_dirs out in)
  foreach (file ${in})
    # Add directory including the header
    get_filename_component(dir ${file} DIRECTORY)
    list(APPEND ${out} ${dir})
    # If the directory is corecrypto, we should also add its
    # parent to the include dir.
    get_filename_component(dirname ${dir} NAME)
    if (${dirname} STREQUAL "corecrypto")
      get_filename_component(parent ${dir} DIRECTORY)
      list(APPEND ${out} ${parent})
    endif()
  endforeach()
endmacro()
# Project-level settings
## Build all objects with -fPIC
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
## CMake spelling of -std=gnu99
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS ON)
## Project-globals
set_property(DIRECTORY
APPEND PROPERTY COMPILE_DEFINITIONS
COMPILING_CORECRYPTO=1
$<$<CONFIG:Debug>:DEBUG=1>
$<$<CONFIG:Release>:NDEBUG>
)
set(CC_C_OPTIONS
-DBUILDKERNEL=0
-Wundef
-Wcast-qual
-Wno-error=deprecated-declarations
$<$<CONFIG:Debug>:-Werror>
)
add_compile_options(
"$<$<COMPILE_LANGUAGE:C>:${CC_C_OPTIONS}>"
)
# System dependencies
find_package(UnixCommands REQUIRED) # For ${BASH}
find_package(Threads REQUIRED)
find_library(MATH_LIBRARY m DOC "libm")
if(NOT MATH_LIBRARY)
message(SEND_ERROR "Could not find libm")
endif()
# Platform-specific dependencies
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
find_library(SYSTEM_FRAMEWORK NAMES System)
mark_as_advanced(SYSTEM_FRAMEWORK)
find_path(SYSTEM_CPU_CAPABILITIES_PATH i386/cpu_capabilities.h
HINTS "${SYSTEM_FRAMEWORK}/PrivateHeaders")
mark_as_advanced(SYSTEM_CPU_CAPABILITIES_PATH)
if(NOT SYSTEM_FRAMEWORK OR NOT SYSTEM_CPU_CAPABILITIES_PATH)
unset(SYSTEM_FRAMEWORK CACHE)
message(SEND_ERROR
"Could not find internal System.framework\n"
"HINT: Run cmake with xcrun to point it at the right SDK, or try:\n"
" ${CMAKE_COMMAND} -DCMAKE_OSX_SYSROOT=macosx.internal .")
else()
message("-- Found internal System.framework")
endif()
# Compile assembler sources in OSX
enable_language(ASM)
# Enable FIPS POST trace in OSX
set_source_files_properties(cc_fips/src/fipspost_trace.c cc_fips/crypto_test/crypto_test_cc_fips.c
PROPERTIES COMPILE_FLAGS -DCORECRYPTO_POST_TRACE=1)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
# Exclude sources that don't apply to Linux (or haven't yet been ported)
set (CORECRYPTO_EXCLUDE_SRCS
# exclude files that are OSX dependent
cc_fips/src/fipspost_get_cpu_key.c
cc_fips/src/fipspost_get_hmac.c
cckprng/src/cckprng_diag.c
cckprng/src/cckprng_diaggens.c
cckprng/src/cckprng_generate.c
cckprng/src/cckprng_init.c
cckprng/src/cckprng_initgen.c
cckprng/src/cckprng_loadseed.c
cckprng/src/cckprng_printdiag.c
cckprng/src/cckprng_ratchetseed.c
cckprng/src/cckprng_refresh.c
cckprng/src/cckprng_rekeygen.c
cckprng/src/cckprng_rekeygens.c
cckprng/src/cckprng_reseed.c
cckprng/src/cckprng_storeseed.c
cckprng/src/prng.c
)
set (CORECRYPTO_TEST_EXCLUDE_SRCS
# exclude files that are OSX dependent
cc_fips/src/fipspost_get_cpu_key.c
cc_fips/src/fipspost_get_hmac.c
corecrypto_test/lib/ccshadow.c
corecrypto_test/lib/cccycles.c
cckprng/crypto_test/crypto_test_kprng.c
# this test requires trace to be enabled
cc_fips/crypto_test/crypto_test_cc_fips.c
)
set (CORECRYPTO_PERF_EXCLUDE_SRCS
# exclude files that are OSX dependent
corecrypto_perf/src/ccperf_kprng.c
)
if (CC_LINUX_ASM)
enable_language(ASM)
# Add assembler specific clang flags
set (CC_ASM_OPTIONS
-integrated-as # Always use clang internal assembler
-x assembler-with-cpp # Run preprocessor despite .s name
)
add_compile_options(
"$<$<COMPILE_LANGUAGE:ASM>:${CC_ASM_OPTIONS}>"
)
# Enable Linux assembler in corecrypto
add_compile_options(
"-DCC_LINUX_ASM=1"
)
endif()
endif()
include(GNUInstallDirs)
if(NOT CMAKE_C_COMPILER_ID MATCHES "Clang")
message(FATAL_ERROR "Only clang is supported for compilation, found ${CMAKE_C_COMPILER_ID} (${CMAKE_C_COMPILER})")
endif()
#
# corecrypto_static library target
#
# A few include dirs cannot be automatically generated by the above headers
# list. Manually fix it up.
set (CORECRYPTO_FIXED_INCLUDE_DIRS
ccaes/src/vng
cckprng
cckprng/corecrypto
corecrypto_test/include
acceleratecrypto/Include
acceleratecrypto/Header
ccec25519/src
)
# Find include dirs for corecrypto_static headers.
set (cc_include_dir ${CORECRYPTO_FIXED_INCLUDE_DIRS})
get_include_dirs (cc_include_dir "${CORECRYPTO_PROJECT_HDRS}")
get_include_dirs (cc_include_dir "${CORECRYPTO_PUBLIC_HDRS}")
get_include_dirs (cc_include_dir "${CORECRYPTO_PRIVATE_HDRS}")
list (REMOVE_DUPLICATES cc_include_dir)
# Filter out excluded sources
if(CORECRYPTO_EXCLUDE_SRCS)
list(REMOVE_ITEM CORECRYPTO_SRCS ${CORECRYPTO_EXCLUDE_SRCS})
endif()
# Create target for corecrypto_static
add_library(corecrypto_static STATIC ${CORECRYPTO_SRCS})
target_link_libraries(corecrypto_static
PRIVATE $<$<PLATFORM_ID:Darwin>:${SYSTEM_FRAMEWORK}> ${MATH_LIBRARY})
target_include_directories(corecrypto_static PRIVATE ${cc_include_dir})
set_property(TARGET corecrypto_static PROPERTY POSITION_INDEPENDENT_CODE ON)
# Generate pkgconfig for corecrypto_static
configure_file("corecrypto.pc.in" "corecrypto.pc" @ONLY)
# Install corecrypto_static
install (TARGETS corecrypto_static ARCHIVE
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
install (FILES ${CORECRYPTO_PUBLIC_HDRS} ${CORECRYPTO_PRIVATE_HDRS}
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/corecrypto")
install (FILES ${CMAKE_CURRENT_BINARY_DIR}/corecrypto.pc
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
#
# corecrypto_test target
#
# Remove the .inc and other non C files from the sources
foreach (file ${CORECRYPTO_TEST_SRCS})
string (REGEX MATCH ".+\\.c$" match ${file})
if (NOT match)
list (REMOVE_ITEM CORECRYPTO_TEST_SRCS ${file})
endif()
endforeach()
# A few include dirs cannot be automatically generated by the above headers
# list. Manually fix it up.
set (CORECRYPTO_TEST_FIXED_INCLUDE_DIRS
ccsha2/src
ccrng/src
ccec25519/src
ccaes/src/ios_hardware
corecrypto_test
cczp/src
)
# Find include dirs for corecrypto_test headers.
set (cctest_include_dir ${CORECRYPTO_TEST_FIXED_INCLUDE_DIRS})
get_include_dirs (cctest_include_dir "${CORECRYPTO_TEST_HDRS}")
get_include_dirs (cctest_include_dir "${CORECRYPTO_TEST_SRCS}")
list (REMOVE_DUPLICATES cctest_include_dir)
# Create target for corecrypto_test
if(CORECRYPTO_TEST_EXCLUDE_SRCS)
list (REMOVE_ITEM CORECRYPTO_TEST_SRCS ${CORECRYPTO_TEST_EXCLUDE_SRCS})
endif()
add_executable(corecrypto_test ${CORECRYPTO_TEST_SRCS})
target_compile_definitions(corecrypto_test PRIVATE CC_UNITTEST=1)
target_include_directories(corecrypto_test
PRIVATE ${cctest_include_dir} ${cc_include_dir})
target_link_libraries(corecrypto_test PRIVATE corecrypto_static
Threads::Threads ${MATH_LIBRARY} ${CMAKE_DL_LIBS})
# Generate test vectors
set(CC_CONVERT_TEST_VECTORS scripts/convert_testvectors.sh)
set(CC_TEST_VECTORS corecrypto_test/test_vectors/wycheproof/chacha20_poly1305_test.json)
set(GENERATED_TEST_VECTORS_DIR ${CMAKE_CURRENT_BINARY_DIR}/gen/corecrypto_test/include)
set(GENERATED_TEST_VECTORS ${GENERATED_TEST_VECTORS_DIR}/cc_generated_test_vectors.h
)
add_custom_command(
OUTPUT ${GENERATED_TEST_VECTORS}
COMMAND ${CMAKE_COMMAND} -E make_directory ${GENERATED_TEST_VECTORS_DIR}
COMMAND ${BASH} ${CMAKE_SOURCE_DIR}/${CC_CONVERT_TEST_VECTORS} ${GENERATED_TEST_VECTORS} ${CMAKE_CURRENT_SOURCE_DIR}/corecrypto_test/test_vectors/wycheproof
COMMENT "Generating test vectors"
DEPENDS ${CC_CONVERT_TEST_VECTORS} ${CC_TEST_VECTORS}
)
target_sources(corecrypto_test PRIVATE ${GENERATED_TEST_VECTORS})
target_include_directories(corecrypto_test PRIVATE ${GENERATED_TEST_VECTORS_DIR})
set(CC_CONVERT_TEST_VECTORS_PC scripts/convert_h2c_testvectors.py)
message(STATUS "Running python convert_h2c_testvectors.py")
execute_process(
COMMAND ${PYTHON} ${CMAKE_SOURCE_DIR}/${CC_CONVERT_TEST_VECTORS_PC} ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE RESULT_PC
OUTPUT_VARIABLE OUTPUT_PC
ERROR_VARIABLE ERROR_PC
)
message(STATUS "result convert_vectors: ${RESULT_PC}")
message(STATUS "output convert_vectors: ${OUTPUT_PC}")
message(STATUS "error convert_vectors: ${ERROR_PC}")
#
# corecrypto_perf target
#
# ccperf.h lives in corecrypto_perf/corecrypto. Add it to the include dirs
set (CORECRYPTO_PERF_FIXED_INCLUDE_DIRS
corecrypto_perf/corecrypto
)
set (ccperf_include_dir ${CORECRYPTO_PERF_FIXED_INCLUDE_DIRS})
# Create target for corecrypto_perf
if(CORECRYPTO_PERF_EXCLUDE_SRCS)
list (REMOVE_ITEM CORECRYPTO_PERF_SRCS ${CORECRYPTO_PERF_EXCLUDE_SRCS})
endif()
add_executable(corecrypto_perf ${CORECRYPTO_PERF_SRCS})
target_include_directories(corecrypto_perf
PRIVATE ${ccperf_include_dir} ${cctest_include_dir} ${cc_include_dir})
target_link_libraries(corecrypto_perf PRIVATE corecrypto_static Threads::Threads ${MATH_LIBRARY})

1135
CoreCryptoSources.cmake Normal file

File diff suppressed because it is too large

61
License.txt Normal file

@@ -0,0 +1,61 @@
Copyright (c) Apple Inc. All rights reserved.
corecrypto Internal Use License Agreement
IMPORTANT: This Apple corecrypto software is supplied to you by Apple Inc. ("Apple")
in consideration of your agreement to the following terms, and your download or use
of this Apple software constitutes acceptance of these terms. If you do not agree
with these terms, please do not download or use this Apple software.
1. As used in this Agreement, the term "Apple Software" collectively means and
includes all of the Apple corecrypto materials provided by Apple here, including
but not limited to the Apple corecrypto software, frameworks, libraries, documentation
and other Apple-created materials. In consideration of your agreement to abide by the
following terms, conditioned upon your compliance with these terms and subject to
these terms, Apple grants you, for a period of ninety (90) days from the date you
download the Apple Software, a limited, non-exclusive, non-sublicensable license
under Apple's copyrights in the Apple Software to make a reasonable number of copies
of, compile, and run the Apple Software internally within your organization only on
devices and computers you own or control, for the sole purpose of verifying the
security characteristics and correct functioning of the Apple Software; provided
that you must retain this notice and the following text and disclaimers in all
copies of the Apple Software that you make. You may not, directly or indirectly,
redistribute the Apple Software or any portions thereof. The Apple Software is only
licensed and intended for use as expressly stated above and may not be used for other
purposes or in other contexts without Apple's prior written permission. Except as
expressly stated in this notice, no other rights or licenses, express or implied, are
granted by Apple herein.
2. The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES
OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING
THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS,
SYSTEMS, OR SERVICES. APPLE DOES NOT WARRANT THAT THE APPLE SOFTWARE WILL MEET YOUR
REQUIREMENTS, THAT THE OPERATION OF THE APPLE SOFTWARE WILL BE UNINTERRUPTED OR
ERROR-FREE, THAT DEFECTS IN THE APPLE SOFTWARE WILL BE CORRECTED, OR THAT THE APPLE
SOFTWARE WILL BE COMPATIBLE WITH FUTURE APPLE PRODUCTS, SOFTWARE OR SERVICES. NO ORAL
OR WRITTEN INFORMATION OR ADVICE GIVEN BY APPLE OR AN APPLE AUTHORIZED REPRESENTATIVE
WILL CREATE A WARRANTY.
3. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING
IN ANY WAY OUT OF THE USE, REPRODUCTION, COMPILATION OR OPERATION OF THE APPLE
SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING
NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
4. This Agreement is effective until terminated. Your rights under this Agreement will
terminate automatically without notice from Apple if you fail to comply with any term(s)
of this Agreement. Upon termination, you agree to cease all use of the Apple Software
and destroy all copies, full or partial, of the Apple Software. This Agreement will be
governed and construed in accordance with the laws of the State of California, without
regard to its choice of law rules.
You may report security issues about Apple products to product-security@apple.com,
as described here:  https://www.apple.com/support/security/. Non-security bugs and
enhancement requests can be made via https://bugreport.apple.com as described
here: https://developer.apple.com/bug-reporting/
EA1350
10/5/15

13
Makefile Normal file

@@ -0,0 +1,13 @@
# Copyright (c) (2017,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#
coverage:
./scripts/corecrypto_coverage.sh

127
README.md Normal file

@@ -0,0 +1,127 @@
/* Copyright (c) (2010,2012,2014-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
The corecrypto (cc) project
===========================
The main goal is to provide low-level, fast math routines and crypto APIs which
can be used in various environments (Kernel, bootloader, userspace, etc.). It
is an explicit goal to minimize dependencies between modules and functions so
that clients of this library only end up with the routines they need and
nothing more.
Corecrypto compiles under all Apple OSs, Windows, Android and Linux.
Corecrypto Modules
------------------
corecrypto currently consists of the following submodules:
* `cc`: Headers and code common to all of the modules
* `ccasn1`: ASN.1 typeid constants and ccoid definition.
* `ccder`: DER encoding/decoding support
* `ccn`: Math on vectors of n cc_units
* `cczp`: Modular arithmetic mod integer p, on vectors of n cc_units
* `ccz`: Variable sized signed integer math routines
* `ccdrbg`: Deterministic Random Byte Generators
* `ccrng`: Random Byte Generators
* `ccdh`: Diffie-Hellman routines.
* `ccec25519`: Elliptic curve signature and Diffie-Hellman routines using the Edwards 25519 curve
* `ccrsa`: RSA routines.
* `ccec`: Elliptic curves, EC-specific math and APIs
* `ccdigest`: Digest abstraction layer.
* `cchmac`: HMAC using any ccdigest.
* `ccpbkdf2`: PBKDF2 using any ccdigest.
* `ccmd2`: MD2 digest implementations.
* `ccmd4`: MD4 digest implementations.
* `ccmd5`: MD5 digest implementations.
* `ccripemd`: RIPE-MD digest implementations.
* `ccsha1`: SHA-1 digest implementations.
* `ccsha2`: SHA-2 digest implementations.
* `ccmode`: Symmetric cipher chaining mode interfaces.
* `ccpad`: Symmetric cipher padding code.
* `ccaes`: AES symmetric cipher implementations.
* `ccblowfish`: Blowfish symmetric cipher implementations.
* `cccast`: CAST symmetric cipher implementations.
* `ccdes`: DES and 3DES symmetric cipher implementations.
* `ccrc2`: RC2 symmetric cipher implementations.
* `ccrc4`: RC4 symmetric cipher implementations.
* `ccperf`: Performance testing harness.
* `cctest`: Common utilities for creating self tests and XCTest unit tests.
* `ccprime`: Functions for generating large prime numbers. Mostly used in RSA key generation.
* `ccspake`: SPAKE2+ password-based key exchange implementation.
### Module Subdirectories
Each module has the following subdirectories:
* `corecrypto`: headers for this module
* `src`: sources for this module
* `doc`: documentation, references, etc.
* `xcunit`: XCTest based unit tests for this module.
* `crypto_tests`: sources for executable tests for this module
* `test_vectors`: test vectors for this module
* `tools`: sources for random helper tools.
The following subdirectories don't follow the module layout yet:
* `corecrypto_kext`: Supporting files for kernel extension build and fips support.
* `corecrypto_dylib`: Supporting files for userspace shared lib build and fips support.
ARMV6m
------
The ARMV6m target is not on the corecrypto project target list. To compile corecrypto for ARMV6m, use the following command:
`$ xcodebuild -target "corecrypto_static" OTHER_CFLAGS="-Qunused-arguments" -sdk iphoneos.internal -arch armv6m`
Windows
-------
corecrypto compiles under Windows using Visual Studio 2015 and Clang with Microsoft CodeGen. The corecrypto Solution contains four projects:
1. `corecrypto`: This project compiles corecrypto and produces a static library in 32- and 64-bit modes.
2. `corecrypto_test`: This project compiles corecrypto test files and links statically with the corecrypto debug library.
3. `corecrypto_perf`: This project compiles corecrypto performance measurement files and links statically with the corecrypto release library.
4. `corecrypto_wintest`: This project contains simple code that links to corecrypto.lib and compiles as C++ using the Visual C++ compiler. It was created to
make sure the corecrypto library can be linked into C++ software that is compiled with the Microsoft compiler.
Android
------
The corecrypto library, `corecrypto_test`, and `corecrypto_perf` compile under Android. The Android project file is in the `android` subdirectory.
Linux
-----
The corecrypto library, `corecrypto_test`, and `corecrypto_perf` compile under Linux and are built using CMake. See the CMake section for more details.
The Linux build does not use the assembly (ASM) implementations due to differences between the assemblers on Darwin and Linux.
CMake
-----
The corecrypto library, `corecrypto_test`, and `corecrypto_perf` can also be built using CMake on macOS and Linux.
To compile using CMake, run the usual CMake commands:
```
$ cd <srcdir>
$ mkdir build && cd build
$ CC=clang CXX=clang++ cmake ..
$ make
```
where `<srcdir>` is the path to the directory containing the sources.
To install, type `make install` from the build directory (this will require root privileges).
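The CMake build also configures a `corecrypto.pc` pkg-config file and installs it under the library directory's `pkgconfig` folder (see `CMakeLists.txt` above). Assuming the installed `.pc` file carries the appropriate include and link flags, a client program could then be compiled against the static library along these lines (`myprog.c` is just a placeholder name):
```
$ cc myprog.c $(pkg-config --cflags --libs corecrypto) -o myprog
```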
Prototype changes
-----------------
From time to time, corecrypto needs to change the prototypes of functions.
In this case, we use a macro defined as:
`CC_CHANGEFUNCTION_<radar>_<function name>`
and the header will document instructions to migrate from the old to the new function prototype.
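As a purely hypothetical illustration of the pattern (the radar number `12345678`, the function `ccfoo_init`, and both prototypes below are placeholders, not actual corecrypto definitions), such a macro might gate the old and new prototypes in a header roughly like this:
```c
/* Hypothetical sketch only: placeholder names, not actual corecrypto APIs. */
#include <stddef.h>

struct ccfoo_ctx; /* opaque placeholder context */

#if defined(CC_CHANGEFUNCTION_12345678_ccfoo_init)
/* New prototype: callers that have migrated define the macro. */
int ccfoo_init(struct ccfoo_ctx *ctx, size_t n);
#else
/* Old prototype, kept so existing callers keep building until they migrate. */
int ccfoo_init(struct ccfoo_ctx *ctx);
#endif
```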

View File

@@ -0,0 +1,919 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 50;
objects = {
/* Begin PBXAggregateTarget section */
2CD5E9C120D85B370097F130 /* AccelerateCrypto */ = {
isa = PBXAggregateTarget;
buildConfigurationList = 2CD5E9C420D85B370097F130 /* Build configuration list for PBXAggregateTarget "AccelerateCrypto" */;
buildPhases = (
);
dependencies = (
2C88439021B74BE100C49BD9 /* PBXTargetDependency */,
2C6CED2E20E195E90045D491 /* PBXTargetDependency */,
);
name = AccelerateCrypto;
productName = AccelerateCrypto;
};
/* End PBXAggregateTarget section */
/* Begin PBXBuildFile section */
2C6CED1120E1956A0045D491 /* sha512_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */; };
2C6CED1220E195710045D491 /* sha512_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */; };
2C6CED1320E1957F0045D491 /* sha512_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */; };
2C6CED1420E1957F0045D491 /* sha512_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */; };
2C6CED1520E1957F0045D491 /* sha512_compress_ssse3.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */; };
2C6CED1620E1957F0045D491 /* sha512_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED620DD5D2C00840ABB /* sha512_compress.c */; };
2C6CED1720E1957F0045D491 /* sha512_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED120DD5D2C00840ABB /* sha512_K.c */; };
2C6CED1820E195850045D491 /* sha256_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */; };
2C6CED1920E195890045D491 /* sha256_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */; };
2C6CED1A20E1958D0045D491 /* sha256_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */; };
2C6CED1B20E1958D0045D491 /* sha256_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */; };
2C6CED1C20E1958D0045D491 /* sha256_compress_ssse3_32.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */; };
2C6CED1D20E1958D0045D491 /* sha256_compress_ssse3_64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */; };
2C6CED1E20E1958D0045D491 /* sha256_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBB20DD5D1800840ABB /* sha256_compress.c */; };
2C6CED1F20E1958D0045D491 /* sha256_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ECA20DD5D1900840ABB /* sha256_K.c */; };
2C6CED2020E195930045D491 /* sha1_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */; };
2C6CED2120E195970045D491 /* sha1_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */; };
2C6CED2220E1959B0045D491 /* sha1_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */; };
2C6CED2320E1959B0045D491 /* sha1_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */; };
2C6CED2420E1959B0045D491 /* sha1_compress_sse.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */; };
2C6CED2520E1959B0045D491 /* sha1_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA820DD5D0100840ABB /* sha1_compress.c */; };
2C6CED2620E195A80045D491 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE120DD5D4600840ABB /* decrypt.s */; };
2C6CED2720E195A80045D491 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE220DD5D4600840ABB /* encrypt.s */; };
2C6CED2820E195A80045D491 /* vpaes-armv7.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE020DD5D4600840ABB /* vpaes-armv7.s */; };
2C6CED2920E195B60045D491 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDC20DD5D4600840ABB /* decrypt.s */; };
2C6CED2A20E195B60045D491 /* decrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */; };
2C6CED2B20E195B60045D491 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDE20DD5D4600840ABB /* encrypt.s */; };
2C6CED2C20E195B60045D491 /* encrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */; };
2C6CED2F20E302B40045D491 /* AccelerateCrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */; };
2C88436C21B74AD500C49BD9 /* sha1_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA820DD5D0100840ABB /* sha1_compress.c */; };
2C88436D21B74AD500C49BD9 /* sha256_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */; };
2C88436E21B74AD500C49BD9 /* sha256_compress_ssse3_64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */; };
2C88436F21B74AD500C49BD9 /* sha1_compress_sse.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */; };
2C88437021B74AD500C49BD9 /* encrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */; };
2C88437121B74AD500C49BD9 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE220DD5D4600840ABB /* encrypt.s */; };
2C88437221B74AD500C49BD9 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDE20DD5D4600840ABB /* encrypt.s */; };
2C88437321B74AD500C49BD9 /* sha1_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */; };
2C88437421B74AD500C49BD9 /* sha256_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBB20DD5D1800840ABB /* sha256_compress.c */; };
2C88437521B74AD500C49BD9 /* sha512_compress_ssse3.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */; };
2C88437621B74AD500C49BD9 /* sha512_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */; };
2C88437721B74AD500C49BD9 /* sha512_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */; };
2C88437821B74AD500C49BD9 /* sha256_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ECA20DD5D1900840ABB /* sha256_K.c */; };
2C88437921B74AD500C49BD9 /* sha1_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */; };
2C88437A21B74AD500C49BD9 /* sha512_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED620DD5D2C00840ABB /* sha512_compress.c */; };
2C88437B21B74AD500C49BD9 /* sha256_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */; };
2C88437C21B74AD500C49BD9 /* sha256_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */; };
2C88437D21B74AD500C49BD9 /* vpaes-armv7.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE020DD5D4600840ABB /* vpaes-armv7.s */; };
2C88437E21B74AD500C49BD9 /* sha256_compress_ssse3_32.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */; };
2C88437F21B74AD500C49BD9 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE120DD5D4600840ABB /* decrypt.s */; };
2C88438021B74AD500C49BD9 /* sha1_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */; };
2C88438121B74AD500C49BD9 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDC20DD5D4600840ABB /* decrypt.s */; };
2C88438221B74AD500C49BD9 /* sha512_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED120DD5D2C00840ABB /* sha512_K.c */; };
2C88438321B74AD500C49BD9 /* sha512_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */; };
2C88438421B74AD500C49BD9 /* sha256_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */; };
2C88438521B74AD500C49BD9 /* sha1_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */; };
2C88438621B74AD500C49BD9 /* decrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */; };
2C88438721B74AD500C49BD9 /* sha512_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */; };
2C8843A921B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */; };
2C8843AA21B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */; };
2C8843AB21B8AA8200C49BD9 /* Context.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C8843A521B8AA8200C49BD9 /* Context.h */; };
2C8843AC21B8AA8200C49BD9 /* Context.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C8843A521B8AA8200C49BD9 /* Context.h */; };
2C8843AD21B8AA8200C49BD9 /* crypt_aesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */; };
2C8843AE21B8AA8200C49BD9 /* crypt_aesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */; };
2C8843AF21B8AA8200C49BD9 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A721B8AA8200C49BD9 /* aes.c */; };
2C8843B021B8AA8200C49BD9 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A721B8AA8200C49BD9 /* aes.c */; };
2C93F58321BAF750009239B3 /* AccelerateCrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
2C6CED2D20E195E90045D491 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 2CC8863B20D859F200D17D95 /* Project object */;
proxyType = 1;
remoteGlobalIDString = 2C6CED0720E195360045D491;
remoteInfo = libAccelerateCrypto;
};
2C88438F21B74BE100C49BD9 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 2CC8863B20D859F200D17D95 /* Project object */;
proxyType = 1;
remoteGlobalIDString = 2C88436A21B74AD500C49BD9;
remoteInfo = libAccelerateCrypto_kernel;
};
/* End PBXContainerItemProxy section */
/* Begin PBXFileReference section */
2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = AccelerateCrypto.h; path = Header/AccelerateCrypto.h; sourceTree = SOURCE_ROOT; };
2C447EA020DD5C1300840ABB /* config.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = config.h; path = Include/config.h; sourceTree = SOURCE_ROOT; };
2C447EA120DD5C1300840ABB /* arm64_isa_compatibility.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = arm64_isa_compatibility.h; path = Include/arm64_isa_compatibility.h; sourceTree = SOURCE_ROOT; };
2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_avx1.s; path = Source/sha1/intel/sha1_compress_avx1.s; sourceTree = SOURCE_ROOT; };
2C447EA820DD5D0100840ABB /* sha1_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha1_compress.c; path = Source/sha1/intel/sha1_compress.c; sourceTree = SOURCE_ROOT; };
2C447EAB20DD5D0100840ABB /* sha1_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha1_compress.c; sourceTree = "<group>"; };
2C447EAC20DD5D0100840ABB /* sha1_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_avx1.s; sourceTree = "<group>"; };
2C447EAD20DD5D0100840ABB /* sha1_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_avx2.s; sourceTree = "<group>"; };
2C447EAE20DD5D0100840ABB /* sha1_compress_sse.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_sse.s; sourceTree = "<group>"; };
2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_arm64.s; sourceTree = "<group>"; };
2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_sse.s; path = Source/sha1/intel/sha1_compress_sse.s; sourceTree = SOURCE_ROOT; };
2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_avx2.s; path = Source/sha1/intel/sha1_compress_avx2.s; sourceTree = SOURCE_ROOT; };
2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_armv7neon.s; sourceTree = "<group>"; };
2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_avx1.s; path = Source/sha256/intel/sha256_compress_avx1.s; sourceTree = SOURCE_ROOT; };
2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_ssse3_64.s; path = Source/sha256/intel/sha256_compress_ssse3_64.s; sourceTree = SOURCE_ROOT; };
2C447EBB20DD5D1800840ABB /* sha256_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha256_compress.c; path = Source/sha256/intel/sha256_compress.c; sourceTree = SOURCE_ROOT; };
2C447EBD20DD5D1800840ABB /* sha256_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha256_compress.c; sourceTree = "<group>"; };
2C447EBE20DD5D1800840ABB /* sha256_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_avx1.s; sourceTree = "<group>"; };
2C447EBF20DD5D1800840ABB /* sha256_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_avx2.s; sourceTree = "<group>"; };
2C447EC020DD5D1800840ABB /* sha256_compress_ssse3_32.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_ssse3_32.s; sourceTree = "<group>"; };
2C447EC120DD5D1800840ABB /* sha256_compress_ssse3_64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_ssse3_64.s; sourceTree = "<group>"; };
2C447EC220DD5D1800840ABB /* sha256_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha256_K.c; sourceTree = "<group>"; };
2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_arm64.s; sourceTree = "<group>"; };
2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_armv7neon.s; sourceTree = "<group>"; };
2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_avx2.s; path = Source/sha256/intel/sha256_compress_avx2.s; sourceTree = SOURCE_ROOT; };
2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_ssse3_32.s; path = Source/sha256/intel/sha256_compress_ssse3_32.s; sourceTree = SOURCE_ROOT; };
2C447ECA20DD5D1900840ABB /* sha256_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha256_K.c; path = Source/sha256/intel/sha256_K.c; sourceTree = SOURCE_ROOT; };
2C447ECC20DD5D2C00840ABB /* sha512_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha512_compress.c; sourceTree = "<group>"; };
2C447ECD20DD5D2C00840ABB /* sha512_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_avx1.s; sourceTree = "<group>"; };
2C447ECE20DD5D2C00840ABB /* sha512_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_avx2.s; sourceTree = "<group>"; };
2C447ECF20DD5D2C00840ABB /* sha512_compress_ssse3.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_ssse3.s; sourceTree = "<group>"; };
2C447ED120DD5D2C00840ABB /* sha512_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha512_K.c; path = Source/sha512/sha512_K.c; sourceTree = SOURCE_ROOT; };
2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_ssse3.s; path = Source/sha512/intel/sha512_compress_ssse3.s; sourceTree = SOURCE_ROOT; };
2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_avx2.s; path = Source/sha512/intel/sha512_compress_avx2.s; sourceTree = SOURCE_ROOT; };
2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_avx1.s; path = Source/sha512/intel/sha512_compress_avx1.s; sourceTree = SOURCE_ROOT; };
2C447ED620DD5D2C00840ABB /* sha512_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha512_compress.c; path = Source/sha512/intel/sha512_compress.c; sourceTree = SOURCE_ROOT; };
2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_arm64.s; sourceTree = "<group>"; };
2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_armv7neon.s; sourceTree = "<group>"; };
2C447EDC20DD5D4600840ABB /* decrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = decrypt.s; sourceTree = "<group>"; };
2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = decrypt_ecb.s; sourceTree = "<group>"; };
2C447EDE20DD5D4600840ABB /* encrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = encrypt.s; sourceTree = "<group>"; };
2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = encrypt_ecb.s; sourceTree = "<group>"; };
2C447EE020DD5D4600840ABB /* vpaes-armv7.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = "vpaes-armv7.s"; path = "Source/aes/arm/vpaes-armv7.s"; sourceTree = SOURCE_ROOT; };
2C447EE120DD5D4600840ABB /* decrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = decrypt.s; path = Source/aes/arm/decrypt.s; sourceTree = SOURCE_ROOT; };
2C447EE220DD5D4600840ABB /* encrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = encrypt.s; path = Source/aes/arm/encrypt.s; sourceTree = SOURCE_ROOT; };
2C447EE420DD5D4700840ABB /* EncryptDecrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = EncryptDecrypt.s; path = Source/aes/arm/EncryptDecrypt.s; sourceTree = SOURCE_ROOT; };
2C6CED0820E195360045D491 /* libAccelerateCrypto.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libAccelerateCrypto.a; sourceTree = BUILT_PRODUCTS_DIR; };
2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libAccelerateCrypto_kernel.a; sourceTree = BUILT_PRODUCTS_DIR; };
2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = crypt_nonaesni.s; path = Source/aes/intel/crypt_nonaesni.s; sourceTree = SOURCE_ROOT; };
2C8843A521B8AA8200C49BD9 /* Context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Context.h; path = Source/aes/intel/Context.h; sourceTree = SOURCE_ROOT; };
2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = crypt_aesni.s; path = Source/aes/intel/crypt_aesni.s; sourceTree = SOURCE_ROOT; };
2C8843A721B8AA8200C49BD9 /* aes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = aes.c; path = Source/aes/intel/aes.c; sourceTree = SOURCE_ROOT; };
2C8843A821B8AA8200C49BD9 /* Data.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Data.s; path = Source/aes/intel/Data.s; sourceTree = SOURCE_ROOT; };
2C8843B321B8AA9700C49BD9 /* EncryptDecrypt.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; name = EncryptDecrypt.s; path = Source/aes/intel/EncryptDecrypt.s; sourceTree = SOURCE_ROOT; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
2C6CED0520E195360045D491 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
2C88438821B74AD500C49BD9 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
2C447E9D20DD5B2600840ABB /* Header */ = {
isa = PBXGroup;
children = (
2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */,
);
path = Header;
sourceTree = "<group>";
};
2C447E9F20DD5BF300840ABB /* Include */ = {
isa = PBXGroup;
children = (
2C447EA120DD5C1300840ABB /* arm64_isa_compatibility.h */,
2C447EA020DD5C1300840ABB /* config.h */,
);
path = Include;
sourceTree = "<group>";
};
2C447EA220DD5C2400840ABB /* Source */ = {
isa = PBXGroup;
children = (
2C447EA620DD5C5F00840ABB /* sha512 */,
2C447EA520DD5C5600840ABB /* sha256 */,
2C447EA420DD5C4F00840ABB /* sha1 */,
2C447EA320DD5C4400840ABB /* aes */,
);
path = Source;
sourceTree = "<group>";
};
2C447EA320DD5C4400840ABB /* aes */ = {
isa = PBXGroup;
children = (
2C8843A321B8AA4900C49BD9 /* intel */,
2C447EE320DD5D4600840ABB /* arm */,
2C447EDB20DD5D4600840ABB /* arm64 */,
);
path = aes;
sourceTree = "<group>";
};
2C447EA420DD5C4F00840ABB /* sha1 */ = {
isa = PBXGroup;
children = (
2C447EB320DD5D0100840ABB /* arm */,
2C447EAF20DD5D0100840ABB /* arm64 */,
2C447EAA20DD5D0100840ABB /* intel */,
2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */,
2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */,
2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */,
2C447EA820DD5D0100840ABB /* sha1_compress.c */,
);
path = sha1;
sourceTree = "<group>";
};
2C447EA520DD5C5600840ABB /* sha256 */ = {
isa = PBXGroup;
children = (
2C447EC620DD5D1900840ABB /* arm */,
2C447EC320DD5D1800840ABB /* arm64 */,
2C447EBC20DD5D1800840ABB /* intel */,
2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */,
2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */,
2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */,
2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */,
2C447EBB20DD5D1800840ABB /* sha256_compress.c */,
2C447ECA20DD5D1900840ABB /* sha256_K.c */,
);
path = sha256;
sourceTree = "<group>";
};
2C447EA620DD5C5F00840ABB /* sha512 */ = {
isa = PBXGroup;
children = (
2C447ED920DD5D2C00840ABB /* arm */,
2C447ED720DD5D2C00840ABB /* arm64 */,
2C447ECB20DD5D2C00840ABB /* intel */,
2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */,
2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */,
2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */,
2C447ED620DD5D2C00840ABB /* sha512_compress.c */,
2C447ED120DD5D2C00840ABB /* sha512_K.c */,
);
path = sha512;
sourceTree = "<group>";
};
2C447EAA20DD5D0100840ABB /* intel */ = {
isa = PBXGroup;
children = (
2C447EAB20DD5D0100840ABB /* sha1_compress.c */,
2C447EAC20DD5D0100840ABB /* sha1_compress_avx1.s */,
2C447EAD20DD5D0100840ABB /* sha1_compress_avx2.s */,
2C447EAE20DD5D0100840ABB /* sha1_compress_sse.s */,
);
name = intel;
path = Source/sha1/intel;
sourceTree = SOURCE_ROOT;
};
2C447EAF20DD5D0100840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */,
);
name = arm64;
path = Source/sha1/arm64;
sourceTree = SOURCE_ROOT;
};
2C447EB320DD5D0100840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */,
);
name = arm;
path = Source/sha1/arm;
sourceTree = SOURCE_ROOT;
};
2C447EBC20DD5D1800840ABB /* intel */ = {
isa = PBXGroup;
children = (
2C447EBD20DD5D1800840ABB /* sha256_compress.c */,
2C447EBE20DD5D1800840ABB /* sha256_compress_avx1.s */,
2C447EBF20DD5D1800840ABB /* sha256_compress_avx2.s */,
2C447EC020DD5D1800840ABB /* sha256_compress_ssse3_32.s */,
2C447EC120DD5D1800840ABB /* sha256_compress_ssse3_64.s */,
2C447EC220DD5D1800840ABB /* sha256_K.c */,
);
name = intel;
path = Source/sha256/intel;
sourceTree = SOURCE_ROOT;
};
2C447EC320DD5D1800840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */,
);
name = arm64;
path = Source/sha256/arm64;
sourceTree = SOURCE_ROOT;
};
2C447EC620DD5D1900840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */,
);
name = arm;
path = Source/sha256/arm;
sourceTree = SOURCE_ROOT;
};
2C447ECB20DD5D2C00840ABB /* intel */ = {
isa = PBXGroup;
children = (
2C447ECC20DD5D2C00840ABB /* sha512_compress.c */,
2C447ECD20DD5D2C00840ABB /* sha512_compress_avx1.s */,
2C447ECE20DD5D2C00840ABB /* sha512_compress_avx2.s */,
2C447ECF20DD5D2C00840ABB /* sha512_compress_ssse3.s */,
);
name = intel;
path = Source/sha512/intel;
sourceTree = SOURCE_ROOT;
};
2C447ED720DD5D2C00840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */,
);
name = arm64;
path = Source/sha512/arm64;
sourceTree = SOURCE_ROOT;
};
2C447ED920DD5D2C00840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */,
);
name = arm;
path = Source/sha512/arm;
sourceTree = SOURCE_ROOT;
};
2C447EDB20DD5D4600840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447EDC20DD5D4600840ABB /* decrypt.s */,
2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */,
2C447EDE20DD5D4600840ABB /* encrypt.s */,
2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */,
);
name = arm64;
path = Source/aes/arm64;
sourceTree = SOURCE_ROOT;
};
2C447EE320DD5D4600840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EE120DD5D4600840ABB /* decrypt.s */,
2C447EE220DD5D4600840ABB /* encrypt.s */,
2C447EE420DD5D4700840ABB /* EncryptDecrypt.s */,
2C447EE020DD5D4600840ABB /* vpaes-armv7.s */,
);
name = arm;
path = Source/aes/arm;
sourceTree = SOURCE_ROOT;
};
2C447EEA20DD5FA700840ABB /* Products */ = {
isa = PBXGroup;
children = (
2C6CED0820E195360045D491 /* libAccelerateCrypto.a */,
2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */,
);
name = Products;
sourceTree = "<group>";
};
2C8843A321B8AA4900C49BD9 /* intel */ = {
isa = PBXGroup;
children = (
2C8843B321B8AA9700C49BD9 /* EncryptDecrypt.s */,
2C8843A721B8AA8200C49BD9 /* aes.c */,
2C8843A521B8AA8200C49BD9 /* Context.h */,
2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */,
2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */,
2C8843A821B8AA8200C49BD9 /* Data.s */,
);
name = intel;
sourceTree = "<group>";
};
2CC8863A20D859F200D17D95 = {
isa = PBXGroup;
children = (
2C447EA220DD5C2400840ABB /* Source */,
2C447E9F20DD5BF300840ABB /* Include */,
2C447E9D20DD5B2600840ABB /* Header */,
2C447EEA20DD5FA700840ABB /* Products */,
);
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXHeadersBuildPhase section */
2C6CED0620E195360045D491 /* Headers */ = {
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
2C8843AB21B8AA8200C49BD9 /* Context.h in Headers */,
2C6CED2F20E302B40045D491 /* AccelerateCrypto.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
2C88438921B74AD500C49BD9 /* Headers */ = {
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
2C93F58321BAF750009239B3 /* AccelerateCrypto.h in Headers */,
2C8843AC21B8AA8200C49BD9 /* Context.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXHeadersBuildPhase section */
/* Begin PBXNativeTarget section */
2C6CED0720E195360045D491 /* libAccelerateCrypto */ = {
isa = PBXNativeTarget;
buildConfigurationList = 2C6CED1020E195360045D491 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto" */;
buildPhases = (
2C6CED0420E195360045D491 /* Sources */,
2C6CED0520E195360045D491 /* Frameworks */,
2C6CED0620E195360045D491 /* Headers */,
);
buildRules = (
);
dependencies = (
);
name = libAccelerateCrypto;
productName = libAccelerateCrypto;
productReference = 2C6CED0820E195360045D491 /* libAccelerateCrypto.a */;
productType = "com.apple.product-type.library.static";
};
2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */ = {
isa = PBXNativeTarget;
buildConfigurationList = 2C88438B21B74AD500C49BD9 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto_kernel" */;
buildPhases = (
2C88436B21B74AD500C49BD9 /* Sources */,
2C88438821B74AD500C49BD9 /* Frameworks */,
2C88438921B74AD500C49BD9 /* Headers */,
);
buildRules = (
);
dependencies = (
);
name = libAccelerateCrypto_kernel;
productName = libAccelerateCrypto;
productReference = 2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */;
productType = "com.apple.product-type.library.static";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
2CC8863B20D859F200D17D95 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 1000;
TargetAttributes = {
2C6CED0720E195360045D491 = {
CreatedOnToolsVersion = 10.0;
};
2CD5E9C120D85B370097F130 = {
CreatedOnToolsVersion = 10.0;
};
};
};
buildConfigurationList = 2CC8863E20D859F200D17D95 /* Build configuration list for PBXProject "AccelerateCrypto" */;
compatibilityVersion = "Xcode 9.3";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = 2CC8863A20D859F200D17D95;
productRefGroup = 2C447EEA20DD5FA700840ABB /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
2CD5E9C120D85B370097F130 /* AccelerateCrypto */,
2C6CED0720E195360045D491 /* libAccelerateCrypto */,
2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */,
);
};
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
2C6CED0420E195360045D491 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
2C6CED2520E1959B0045D491 /* sha1_compress.c in Sources */,
2C6CED1B20E1958D0045D491 /* sha256_compress_avx2.s in Sources */,
2C6CED1D20E1958D0045D491 /* sha256_compress_ssse3_64.s in Sources */,
2C6CED2420E1959B0045D491 /* sha1_compress_sse.s in Sources */,
2C6CED2C20E195B60045D491 /* encrypt_ecb.s in Sources */,
2C6CED2720E195A80045D491 /* encrypt.s in Sources */,
2C6CED2B20E195B60045D491 /* encrypt.s in Sources */,
2C6CED2220E1959B0045D491 /* sha1_compress_avx1.s in Sources */,
2C6CED1E20E1958D0045D491 /* sha256_compress.c in Sources */,
2C6CED1520E1957F0045D491 /* sha512_compress_ssse3.s in Sources */,
2C6CED1220E195710045D491 /* sha512_compress_arm64.s in Sources */,
2C8843AD21B8AA8200C49BD9 /* crypt_aesni.s in Sources */,
2C6CED1320E1957F0045D491 /* sha512_compress_avx1.s in Sources */,
2C6CED1F20E1958D0045D491 /* sha256_K.c in Sources */,
2C6CED2320E1959B0045D491 /* sha1_compress_avx2.s in Sources */,
2C6CED1620E1957F0045D491 /* sha512_compress.c in Sources */,
2C6CED1920E195890045D491 /* sha256_compress_arm64.s in Sources */,
2C6CED1820E195850045D491 /* sha256_compress_armv7neon.s in Sources */,
2C6CED2820E195A80045D491 /* vpaes-armv7.s in Sources */,
2C6CED1C20E1958D0045D491 /* sha256_compress_ssse3_32.s in Sources */,
2C6CED2620E195A80045D491 /* decrypt.s in Sources */,
2C6CED2120E195970045D491 /* sha1_compress_arm64.s in Sources */,
2C6CED2920E195B60045D491 /* decrypt.s in Sources */,
2C6CED1720E1957F0045D491 /* sha512_K.c in Sources */,
2C6CED1420E1957F0045D491 /* sha512_compress_avx2.s in Sources */,
2C6CED1A20E1958D0045D491 /* sha256_compress_avx1.s in Sources */,
2C8843A921B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */,
2C6CED2020E195930045D491 /* sha1_compress_armv7neon.s in Sources */,
2C6CED2A20E195B60045D491 /* decrypt_ecb.s in Sources */,
2C8843AF21B8AA8200C49BD9 /* aes.c in Sources */,
2C6CED1120E1956A0045D491 /* sha512_compress_armv7neon.s in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
2C88436B21B74AD500C49BD9 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
2C88436C21B74AD500C49BD9 /* sha1_compress.c in Sources */,
2C88436D21B74AD500C49BD9 /* sha256_compress_avx2.s in Sources */,
2C88436E21B74AD500C49BD9 /* sha256_compress_ssse3_64.s in Sources */,
2C88436F21B74AD500C49BD9 /* sha1_compress_sse.s in Sources */,
2C88437021B74AD500C49BD9 /* encrypt_ecb.s in Sources */,
2C88437121B74AD500C49BD9 /* encrypt.s in Sources */,
2C88437221B74AD500C49BD9 /* encrypt.s in Sources */,
2C88437321B74AD500C49BD9 /* sha1_compress_avx1.s in Sources */,
2C88437421B74AD500C49BD9 /* sha256_compress.c in Sources */,
2C88437521B74AD500C49BD9 /* sha512_compress_ssse3.s in Sources */,
2C88437621B74AD500C49BD9 /* sha512_compress_arm64.s in Sources */,
2C8843AE21B8AA8200C49BD9 /* crypt_aesni.s in Sources */,
2C88437721B74AD500C49BD9 /* sha512_compress_avx1.s in Sources */,
2C88437821B74AD500C49BD9 /* sha256_K.c in Sources */,
2C88437921B74AD500C49BD9 /* sha1_compress_avx2.s in Sources */,
2C88437A21B74AD500C49BD9 /* sha512_compress.c in Sources */,
2C88437B21B74AD500C49BD9 /* sha256_compress_arm64.s in Sources */,
2C88437C21B74AD500C49BD9 /* sha256_compress_armv7neon.s in Sources */,
2C88437D21B74AD500C49BD9 /* vpaes-armv7.s in Sources */,
2C88437E21B74AD500C49BD9 /* sha256_compress_ssse3_32.s in Sources */,
2C88437F21B74AD500C49BD9 /* decrypt.s in Sources */,
2C88438021B74AD500C49BD9 /* sha1_compress_arm64.s in Sources */,
2C88438121B74AD500C49BD9 /* decrypt.s in Sources */,
2C88438221B74AD500C49BD9 /* sha512_K.c in Sources */,
2C88438321B74AD500C49BD9 /* sha512_compress_avx2.s in Sources */,
2C88438421B74AD500C49BD9 /* sha256_compress_avx1.s in Sources */,
2C8843AA21B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */,
2C88438521B74AD500C49BD9 /* sha1_compress_armv7neon.s in Sources */,
2C88438621B74AD500C49BD9 /* decrypt_ecb.s in Sources */,
2C8843B021B8AA8200C49BD9 /* aes.c in Sources */,
2C88438721B74AD500C49BD9 /* sha512_compress_armv7neon.s in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin PBXTargetDependency section */
2C6CED2E20E195E90045D491 /* PBXTargetDependency */ = {
isa = PBXTargetDependency;
target = 2C6CED0720E195360045D491 /* libAccelerateCrypto */;
targetProxy = 2C6CED2D20E195E90045D491 /* PBXContainerItemProxy */;
};
2C88439021B74BE100C49BD9 /* PBXTargetDependency */ = {
isa = PBXTargetDependency;
target = 2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */;
targetProxy = 2C88438F21B74BE100C49BD9 /* PBXContainerItemProxy */;
};
/* End PBXTargetDependency section */
/* Begin XCBuildConfiguration section */
2C6CED0E20E195360045D491 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = NO;
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/include;
SDKROOT = macosx.internal;
SKIP_INSTALL = YES;
};
name = Debug;
};
2C6CED0F20E195360045D491 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = NO;
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/include;
SDKROOT = macosx.internal;
SKIP_INSTALL = YES;
};
name = Release;
};
2C88438C21B74AD500C49BD9 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = NO;
OTHER_CFLAGS = "-DBUILDKERNEL=1";
"OTHER_CFLAGS[arch=*]" = "-DBUILDKERNEL=1";
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/standalone/firmware/include;
SDKROOT = macosx.internal;
SYSTEM_HEADER_SEARCH_PATHS = "$(SDKROOT)/System/Library/Frameworks/Kernel.framework/Headers";
};
name = Debug;
};
2C88438D21B74AD500C49BD9 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = NO;
OTHER_CFLAGS = "-DBUILDKERNEL=1";
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/standalone/firmware/include;
SDKROOT = macosx.internal;
SYSTEM_HEADER_SEARCH_PATHS = "$(SDKROOT)/System/Library/Frameworks/Kernel.framework/Headers";
};
name = Release;
};
2CC8863F20D859F200D17D95 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
SUPPORTED_PLATFORMS = "macosx iphoneos tvos watchos";
};
name = Debug;
};
2CC8864020D859F200D17D95 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
SUPPORTED_PLATFORMS = "macosx iphoneos tvos watchos";
};
name = Release;
};
2CD5E9C220D85B370097F130 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
2CD5E9C320D85B370097F130 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
2C6CED1020E195360045D491 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2C6CED0E20E195360045D491 /* Debug */,
2C6CED0F20E195360045D491 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
2C88438B21B74AD500C49BD9 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto_kernel" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2C88438C21B74AD500C49BD9 /* Debug */,
2C88438D21B74AD500C49BD9 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
2CC8863E20D859F200D17D95 /* Build configuration list for PBXProject "AccelerateCrypto" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2CC8863F20D859F200D17D95 /* Debug */,
2CC8864020D859F200D17D95 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
2CD5E9C420D85B370097F130 /* Build configuration list for PBXAggregateTarget "AccelerateCrypto" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2CD5E9C220D85B370097F130 /* Debug */,
2CD5E9C320D85B370097F130 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 2CC8863B20D859F200D17D95 /* Project object */;
}

View File

@ -0,0 +1,121 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef AccelerateCrypto_h
#define AccelerateCrypto_h
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*! @abstract SHA-1 160-bit digest update for numBlocks chunks of 64-byte (512-bit) data.
*
* @discussion
* This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (CRYPTO), and armv7 (NEON).
*
* @param state (input/output) Array of 5 uint32_t elements.
*
* @param numBlocks (input) Number of 64-byte data chunks.
*
* @param data (input) Array of size numBlocks*64 input bytes.
*/
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t numBlocks, const void *data);
/*! @abstract SHA-256 256-bit digest update for numBlocks chunks of 64-byte (512-bit) data.
*
* @discussion
* This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (CRYPTO), and armv7 (NEON).
*
* @param state (input/output) Array of 8 uint32_t elements.
*
* @param numBlocks (input) Number of 64-byte data chunks.
*
* @param data (input) Array of size numBlocks*64 input bytes.
*/
void AccelerateCrypto_SHA256_compress(uint32_t *state, size_t numBlocks, const void *data);
#if defined(__arm64__)
void AccelerateCrypto_SHA256_compress_arm64neon(uint32_t *state, size_t numBlocks, const void *data);
#endif
/*! @abstract SHA-512 512-bit digest update for numBlocks chunks of 128-byte (1,024-bit) data.
*
* @discussion
* This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (NEON), and armv7 (NEON).
*
* @param state (input/output) Array of 8 uint64_t elements.
*
* @param numBlocks (input) Number of 128-byte data chunks.
*
* @param data (input) Array of size numBlocks*128 input bytes.
*/
void AccelerateCrypto_SHA512_compress(uint64_t *state, size_t numBlocks, const void *data);
#if defined(__arm64__)
void AccelerateCrypto_SHA512_compress_hwassist(uint64_t *state, size_t numBlocks, const void *data);
#endif
/* AES expanded key context */
#define KS_LENGTH 60
typedef struct
{ uint32_t ks[KS_LENGTH]; // maximum expanded key length = (14+1)*16 = 240 bytes = 60 uint32 words
uint32_t rn; // rn = 16*(10,12,14) for AES-128,192,256
} AccelerateCrypto_AES_ctx;
/*! @abstract AES function encrypts a 16-byte input buffer to a 16-byte output buffer according to
* a given input expanded key context.
*
* @discussion
* This routine is optimized for x86_64 (aesni), arm64 (CRYPTO), and armv7 (NEON).
*
* @param in (input) Array of 16-byte message.
*
* @param out (output) Array of 16-byte encrypted message.
*
* @param key (input) Expanded key context for encryption.
*
* @return 0 on success; otherwise a nonzero number indicating failure in the encrypt function.
*
*/
int AccelerateCrypto_AES_encrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
/*! @abstract AES function decrypts a 16-byte input buffer to a 16-byte output buffer according to
* a given input expanded key context.
*
* @discussion
* This routine is optimized for x86_64 (aesni), arm64 (CRYPTO), and armv7 (NEON).
*
* @param in (input) Array of 16-byte encrypted message.
*
* @param out (output) Array of 16-byte decrypted message.
*
* @param key (input) Expanded key context for decryption.
*
* @return 0 on success; otherwise a nonzero number indicating failure in the decrypt function.
*
*/
int AccelerateCrypto_AES_decrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
#if defined(__arm64__)
int AccelerateCrypto_ecb_AES_encrypt(const AccelerateCrypto_AES_ctx *key, uint32_t nblocks, const void *in, void *out);
int AccelerateCrypto_ecb_AES_decrypt(const AccelerateCrypto_AES_ctx *key, uint32_t nblocks, const void *in, void *out);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif /* AccelerateCrypto_h */
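A minimal usage sketch for the interface above (not part of the header; it assumes the header is installed as AccelerateCrypto.h, uses the standard FIPS 180-4 SHA-256 initial hash values, and expects the 64-byte block to be already padded by the caller):
#include <stdint.h>
#include <string.h>
#include "AccelerateCrypto.h"
static void example_sha256_one_block(const uint8_t block[64], uint32_t state[8])
{
    // Standard SHA-256 initial hash values (FIPS 180-4). Message padding is
    // the caller's responsibility; the compress routine only runs the rounds.
    static const uint32_t H0[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    memcpy(state, H0, sizeof(H0));
    AccelerateCrypto_SHA256_compress(state, 1, block);   // one 64-byte chunk
}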

View File

@ -0,0 +1,167 @@
/* Copyright (c) (2013,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
// #include <Availability.h>
#include <sys/cdefs.h>
#if defined(__clang__) && ((defined(__apple_build_version__) && __apple_build_version__ > 5010000))
#define __USES_V_CRYPTO_INTRINSICS 1
#else
#define __USES_V_CRYPTO_INTRINSICS 0
#endif
// AES INSTRUCTIONS
// aese.16b v0, v1
// aesd.16b v0, v1
// aesmc.16b v0, v1
// aesimc.16b v0, v1
// SHA1 INTRINSICS
// sha1su0.4s v0, v1, v2
// sha1su1.4s v0, v1
// sha1c.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1m.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1p.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1h.4s v0, v1 // or s0, s1
// SHA256 INTRINSICS
// sha256su0.4s v0, v1
// sha256su1.4s v0, v1, v2
// sha256h.4s v0, v1, v2 // or q0, q1, v2.4s
// sha256h2.4s v0, v1, v2 // or q0, q1, v2.4s
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro AESE
aese.16b v$0, v$1
.endm
.macro AESD
aesd.16b v$0, v$1
.endm
.macro AESMC
aesmc.16b v$0, v$1
.endm
.macro AESIMC
aesimc.16b v$0, v$1
.endm
#else
.macro AESE
aese q$0, q$1
.endm
.macro AESD
aesd q$0, q$1
.endm
.macro AESMC
aesmc q$0, q$1
.endm
.macro AESIMC
aesimc q$0, q$1
.endm
#endif
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro SHA1SU0
sha1su0 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA1SU1
sha1su1 v$0.4s, v$1.4s
.endm
.macro SHA1C
sha1c q$0, s$1, v$2.4s
.endm
.macro SHA1M
sha1m q$0, s$1, v$2.4s
.endm
.macro SHA1P
sha1p q$0, s$1, v$2.4s
.endm
.macro SHA1H
sha1h s$0, s$1
.endm
.macro SHA256SU0
sha256su0 v$0.4s, v$1.4s
.endm
.macro SHA256SU1
sha256su1 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA256H
sha256h q$0, q$1, v$2.4s
.endm
.macro SHA256H2
sha256h2 q$0, q$1, v$2.4s
.endm
#else
.macro SHA1SU0
sha1su0 q$0, q$1, q$2
.endm
.macro SHA1SU1
sha1su1 q$0, q$1
.endm
.macro SHA1C
sha1c q$0, q$1, q$2
.endm
.macro SHA1M
sha1m q$0, q$1, q$2
.endm
.macro SHA1P
sha1p q$0, q$1, q$2
.endm
.macro SHA1H
sha1h q$0, q$1
.endm
.macro SHA256SU0
sha256su0 q$0, q$1
.endm
.macro SHA256SU1
sha256su1 q$0, q$1, q$2
.endm
.macro SHA256H
sha256h q$0, q$1, q$2
.endm
.macro SHA256H2
sha256h2 q$0, q$1, q$2
.endm
#endif

View File

@ -0,0 +1,66 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#if (defined(__x86_64__) || defined(__i386__))
#if BUILDKERNEL
#include <i386/cpuid.h>
#define HAS_AESNI() ((cpuid_features() & CPUID_FEATURE_AES) != 0)
#define HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
#define HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
#define HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
#define HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
#elif (defined(__APPLE__) && defined(__MACH__) && (__has_include(<System/i386/cpu_capabilities.h>) || __has_include(<System/arm/cpu_capabilities.h>))) // XNU_KERNEL_AVAILABLE
#include <System/i386/cpu_capabilities.h>
extern int _cpu_capabilities;
#define HAS_AESNI() (_cpu_capabilities & kHasAES)
#define HAS_SupplementalSSE3() (_cpu_capabilities & kHasSupplementalSSE3)
#define HAS_AVX1() (_cpu_capabilities & kHasAVX1_0)
#define HAS_AVX2() (_cpu_capabilities & kHasAVX2_0)
#define HAS_AVX512_AND_IN_KERNEL() 0
#else
#if (defined(__AES__))
#define HAS_AESNI() __AES__
#else
#define HAS_AESNI() 0
#endif // defined(__AES__)
#if (defined(__SSSE3__))
#define HAS_SupplementalSSE3() __SSSE3__
#else
#define HAS_SupplementalSSE3() 0
#endif // defined(__SSSE3__)
#if (defined(__AVX__))
#define HAS_AVX1() __AVX__
#else
#define HAS_AVX1() 0
#endif // defined(__AVX__)
#if (defined(__AVX2__))
#define HAS_AVX2() __AVX2__
#else
#define HAS_AVX2() 0
#endif // defined(__AVX2__)
#define HAS_AVX512_AND_IN_KERNEL() 0
#endif
#endif // (defined(__x86_64__) || defined(__i386__))
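As a rough illustration of how these feature macros are typically consumed, a hypothetical dispatcher might look like the sketch below; the sha256_compress_* symbols are placeholders invented for this example, not functions defined by this header:
#include <stddef.h>
#include <stdint.h>
// Placeholder kernels -- shown only to illustrate how HAS_*() gates the choice.
extern void sha256_compress_avx2(uint32_t *state, size_t nblocks, const void *data);
extern void sha256_compress_ssse3(uint32_t *state, size_t nblocks, const void *data);
extern void sha256_compress_portable(uint32_t *state, size_t nblocks, const void *data);
void sha256_compress_dispatch(uint32_t *state, size_t nblocks, const void *data)
{
    if (HAS_AVX2()) {
        sha256_compress_avx2(state, nblocks, data);
    } else if (HAS_SupplementalSSE3()) {
        sha256_compress_ssse3(state, nblocks, data);
    } else {
        sha256_compress_portable(state, nblocks, data);
    }
}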

View File

@ -0,0 +1,12 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#
This is a clone of AccelerateCrypto-2.

View File

@ -0,0 +1,477 @@
# Copyright (c) (2011,2012,2013,2014,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm__)
#define S0 r0
#define S1 r1
#define S2 r2
#define S3 r3
#if Select == 0
#define Name _AccelerateCrypto_AES_encrypt // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _AccelerateCrypto_AES_decrypt // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select
#if defined(__ARM_NEON__) // vpaes uses NEON instructions
.extern _AccelerateCrypto_vpaes_encrypt
.extern _AccelerateCrypto_vpaes_decrypt
#endif
#define ExpandedKey r11
#define ExpandedKeyEnd lr
#define ContextKeyLength 240
#define t r12
.subsections_via_symbols
.text
.syntax unified
.p2align 2
.code 16
.thumb_func Name
.globl Name
Name:
#if defined(__ARM_NEON__) // if neon is available, use cache-attack resilient vector permute AES
#if Select == 0
b _AccelerateCrypto_vpaes_encrypt
#else
b _AccelerateCrypto_vpaes_decrypt
#endif
#else // __ARM_NEON__
// set up debug trace frame pointer
push {r7,lr}
mov r7, sp
// now set up the stack for the current function
push {r1,r4-r6,r8-r11}
sub sp, #(16+8) // make sp 16-byte aligned
// copy r0,r2 to r4,r11 to release r0,r2 (r1 is saved on the stack) for use as S0-S3
mov r4, r0
mov ExpandedKey, r2
// Get and check "key length".
ldr t, [ExpandedKey, #ContextKeyLength]
cmp t, #160
beq 2f
cmp t, #192
beq 2f
cmp t, #224
beq 2f
mov r0, #-1 // Return error.
b 9f
2:
#if (Select == 0)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKeyEnd, ExpandedKey, t
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKeyEnd, ExpandedKey
add ExpandedKey, t
#endif
/*
we need to do this because otherwise ldmia $0, {$1-$4} will hit a memory access error when $0 is not word-aligned in thumb state
*/
.macro thumb2_ldmia
ldr $1, [$0, #0]
ldr $2, [$0, #4]
ldr $3, [$0, #8]
ldr $4, [$0, #12]
.endm
.macro thumb2_stmia
str $1, [$0, #0]
str $2, [$0, #4]
str $3, [$0, #8]
str $4, [$0, #12]
.endm
// Initialize State from input text.
// we need to do this because otherwise ldmia will crash when the input (pointed to by r4) is not word-aligned
thumb2_ldmia r4, S0, S1, S2, S3
// Add round key and save results.
thumb2_ldmia ExpandedKey, r4, r5, r8, r10
add ExpandedKey, #Increment
eor S0, r4
eor S1, r5
eor S2, r8
eor S3, r10
// Set up r6 = _AESEncryptTable or _AESDecryptTable
ldr r6, L_table1
L_table0:
mov r12, pc
ldr r6, [r12, r6]
// save S0-S3 in the stack memory
stmia sp, {S0-S3}
// use this to extract a byte from a shifted word; tried uxtb, same complexity, but that would limit us to armv6 or above
mov r9, #0xff
// Get round key.
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
add ExpandedKey, #Increment
// per round operation
/*
the following macro defines the per round operation for aes
the state computed from the previous round is now saved in sp[0:15]
and r0-r3 has been initialized with the next expanded round key
the macro reads those 16 bytes in sp[0:15] and for each byte does a table look up
the result (4-byte) word is xor-ed to one of r0-r3
the final r0-r3 is the aes state
r6 : points to Main or Final table
r9 : 0xff is used as a byte mask
*/
.macro aes_per_round
#if defined (__ARM_ARCH_7S__)
// better for swift (and old cortex-a8)
// S0 process
ldr t, [sp, #0] // load 4 bytes for S0 process
and r4, r9, t // byte 0
and r5, r9, t, lsr #8 // byte 1
ldr r4, [r6, r4, lsl #2] // 1st table lookup
and r8, r9, t, lsr #16 // byte 2
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
and r10, r9, t, lsr #24 // byte 3
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldr r10, [r6, r10, lsl #2] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
ldr t, [sp, #4] // read Word for next S1 process
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S1, r4
ldr r10, [r6, r10, lsl #2]
eor P0, r5, ror #24
ldr t, [sp, #8]
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S2, r4
ldr r10, [r6, r10, lsl #2]
eor P1, r5, ror #24
ldr t, [sp, #12]
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S3, r4
ldr r10, [r6, r10, lsl #2]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#else
// better for cortex-a7 and cortex-a9
// S0 process
ldrb r4, [sp, #0] // byte 0
ldrb r5, [sp, #1] // byte 1
ldrb r8, [sp, #2] // byte 2
ldrb r10, [sp, #3] // byte 3
ldr r4, [r6, r4, lsl #2] // 1st table lookup
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldr r10, [r6, r10, lsl #2] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
ldrb r4, [sp, #4] // byte 0
ldrb r5, [sp, #5] // byte 1
ldrb r8, [sp, #6] // byte 2
ldrb r10, [sp, #7] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S1, r4
ldr r10, [r6, r10, lsl #2]
eor P0, r5, ror #24
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
ldrb r4, [sp, #8] // byte 0
ldrb r5, [sp, #9] // byte 1
ldrb r8, [sp, #10] // byte 2
ldrb r10, [sp, #11] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S2, r4
ldr r10, [r6, r10, lsl #2]
eor P1, r5, ror #24
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
ldrb r4, [sp, #12] // byte 0
ldrb r5, [sp, #13] // byte 1
ldrb r8, [sp, #14] // byte 2
ldrb r10, [sp, #15] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S3, r4
ldr r10, [r6, r10, lsl #2]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#endif
.endm
.macro aes_last_round
#if defined (__ARM_ARCH_7S__)
// better for swift (and old cortex-a8)
// S0 process
ldr t, [sp, #0] // load 4 bytes for S0 process
and r4, r9, t // byte 0
and r5, r9, t, lsr #8 // byte 1
ldrb r4, [r6, r4] // 1st table lookup
and r8, r9, t, lsr #16 // byte 2
ldrb r5, [r6, r5] // 2nd table lookup
and r10, r9, t, lsr #24 // byte 3
ldrb r8, [r6, r8] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldrb r10, [r6, r10] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
ldr t, [sp, #4] // read Word for next S1 process
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S1, r4
ldrb r10, [r6, r10]
eor P0, r5, ror #24
ldr t, [sp, #8]
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S2, r4
ldrb r10, [r6, r10]
eor P1, r5, ror #24
ldr t, [sp, #12]
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S3, r4
ldrb r10, [r6, r10]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#else
// better for cortex-a7 and cortex-a9
// S0 process
ldrb r4, [sp, #0] // byte 0
ldrb r5, [sp, #1] // byte 1
ldrb r8, [sp, #2] // byte 2
ldrb r10, [sp, #3] // byte 3
ldrb r4, [r6, r4] // 1st table lookup
ldrb r5, [r6, r5] // 2nd table lookup
ldrb r8, [r6, r8] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldrb r10, [r6, r10] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
ldrb r4, [sp, #4] // byte 0
ldrb r5, [sp, #5] // byte 1
ldrb r8, [sp, #6] // byte 2
ldrb r10, [sp, #7] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S1, r4
ldrb r10, [r6, r10]
eor P0, r5, ror #24
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
ldrb r4, [sp, #8] // byte 0
ldrb r5, [sp, #9] // byte 1
ldrb r8, [sp, #10] // byte 2
ldrb r10, [sp, #11] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S2, r4
ldrb r10, [r6, r10]
eor P1, r5, ror #24
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
ldrb r4, [sp, #12] // byte 0
ldrb r5, [sp, #13] // byte 1
ldrb r8, [sp, #14] // byte 2
ldrb r10, [sp, #15] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S3, r4
ldrb r10, [r6, r10]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#endif
.endm
1:
aes_per_round
// Save state for next iteration and load next round key.
stmia sp,{S0-S3}
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
cmp ExpandedKeyEnd, ExpandedKey
add ExpandedKey, #Increment
bne 1b
// set up r6 = _AESSubBytesWordTable or _AESInvSubBytesWordTable
ldr r6, L_table3
L_table2:
mov r12, pc
ldr r6, [r12, r6]
aes_last_round
ldr r4, [sp, #(16+8)] // restore OutputText
thumb2_stmia r4, S0, S1, S2, S3
eor r0, r0 // Return success.
9:
add sp, #(4+16+8) // skip r1 restore
pop {r4-r6,r8-r11}
pop {r7, pc}
.p2align 2
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+4)
.p2align 2
L_table3:
.long L_Tab$non_lazy_ptr2-(L_table2+4)
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 2
L_Tab$non_lazy_ptr:
.indirect_symbol MTable
.long 0
.p2align 2
L_Tab$non_lazy_ptr2:
.indirect_symbol FTable
.long 0
#endif // __ARM_NEON__
#undef S0
#undef S1
#undef S2
#undef S3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment
#endif /* defined(__arm__) */

View File

@ -0,0 +1,13 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#define Select 1
#include "EncryptDecrypt.s"
#undef Select

View File

@ -0,0 +1,13 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#define Select 0
#include "EncryptDecrypt.s"
#undef Select

View File

@ -0,0 +1,751 @@
# Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if !defined(__arm64__) && defined(__ARM_NEON__)
#define ekey r2
#define eax r4
.macro save_all_neon
#if BUILDKERNEL
vstmdb sp!, {q12-q15}
vstmdb sp!, {q8-q11}
vstmdb sp!, {q0-q3}
#endif
vstmdb sp!, {q4-q7}
.endm
.macro restore_all_neon
vldmia sp!, {q4-q7}
#if BUILDKERNEL
vldmia sp!, {q0-q3}
vldmia sp!, {q8-q11}
vldmia sp!, {q12-q15}
#endif
.endm
.macro vpaes_push
push {r4-r7,lr}
add r7, sp, #12
push {r8,r10,r11}
.endm
.macro vpaes_pop
pop {r8,r10,r11}
pop {r4-r7,pc}
.endm
.p2align 6
.Lk_ipt:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_mc_forward:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.p2align 4
vpaes_encrypt_core:
mov r9, ekey
mov r11, #16
adr r10, .Lk_ipt
ldr eax, [ekey, #240]
vldmia r10!,{q3-q4}
vbic q1, q0, q9
vld1.8 {q5}, [r9]!
vshr.u32 q1, q1, #4
vand q0, q0, q9
vtbl.8 d4, {q3}, d0
vtbl.8 d5, {q3}, d1
adr r10, .Lk_mc_backward
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
veor q2, q2, q5
veor q0, q0, q2
cmp eax, #0
b .Lenc_entry
.p2align 4
.Lenc_loop:
vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q4, q4, q5
add r12, r10, r11
veor q5, q0, q4
vld1.8 {q4}, [r12 :128]
sub r12, r12, #64
vtbl.8 d12, {q15}, d4
vtbl.8 d13, {q15}, d5
vld1.8 {q1}, [r12 :128]
vtbl.8 d4, {q14}, d6
vtbl.8 d5, {q14}, d7
veor q2, q2, q6
vtbl.8 d6, {q5}, d8
vtbl.8 d7, {q5}, d9
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
veor q5, q0, q2
add r11, r11, #16
veor q3, q3, q5
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
and r11, r11, #48
subs eax, eax, #1
veor q0, q0, q3
.Lenc_entry:
vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d10, {q11}, d0
vtbl.8 d11, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q3, q3, q5
veor q4, q4, q5
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
veor q2, q2, q0
veor q3, q3, q1
vld1.8 {q5}, [r9]!
bgt .Lenc_loop
adr r12, .Lk_sbo
vld1.8 {q1}, [r12]!
vtbl.8 d8, {q1}, d4
vtbl.8 d9, {q1}, d5
vld1.8 {q2}, [r12]
add r12, r10, r11
veor q4, q4, q5
add r12, r12, #64
vtbl.8 d0, {q2}, d6
vtbl.8 d1, {q2}, d7
vld1.8 {q1}, [r12]
veor q2, q0, q4
vtbl.8 d0, {q2}, d2
vtbl.8 d1, {q2}, d3
bx lr
.p2align 4
.Lk_dipt:
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.quad 0x000302010C0F0E0D, 0x080B0A0904070605 // .Lk_mc_forward+48
.Lk_dsb9:
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x0F060D040B020900, 0x070E050C030A0108
.p2align 4
vpaes_decrypt_core:
mov r9, r2 // dkey
ldr eax, [r2, #240] // Nr
adr r12, .Lk_dipt
vbic q1, q0, q9
vld1.64 {q3}, [r12 :128]!
vshr.u32 q1, q1, #4
vld1.8 {q5}, [r9]!
lsl r11, eax, #4
vand q2, q0, q9
vtbl.8 d4, {q3}, d4
vtbl.8 d5, {q3}, d5
vld1.64 {q4}, [r12 :128]!
eor r11, r11, #48
adr r10, .Lk_dsbd
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
and r11, r11, #48
veor q2, q2, q5
vld1.64 {q5}, [r12 :128]!
veor q0, q0, q2
cmp eax, #0
b .Ldec_entry
.p2align 4
.Ldec_loop:
sub r12, r10, 32
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
veor q0, q0, q4
vext.8 q5, q5, q5, #12
veor q0, q0, q1
subs eax, eax, #1
.Ldec_entry:
vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
vld1.8 {q0}, [r9]!
veor q3, q3, q1
bne .Ldec_loop
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
add r12, r12, r11, lsr #1
vtbl.8 d6, {q7}, d6
vtbl.8 d7, {q7}, d7
vld1.64 {q2}, [r12]
veor q0, q0, q4
veor q1, q0, q3
vtbl.8 d0, {q1}, d4
vtbl.8 d1, {q1}, d5
bx lr
.p2align 6
.Lk_ipt2:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_rcon:
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_sr:
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.p2align 4
vpaes_schedule_core:
bl vpaes_preheat
adr r10, .Lk_rcon
vld1.8 {q0}, [r0]
vld1.64 {q8}, [r10 :128]!
vmov q3, q0
adr r11, .Lk_ipt2
bl vpaes_schedule_transform
vmov q7, q0
cmp r3, #0
bne .Lschedule_am_decrypting
vst1.8 {q0}, [r2]
b .Lschedule_go
.Lschedule_am_decrypting:
add r12, r10, r8
vmov q1, q3
vld1.8 {q3}, [r12]
vtbl.8 d6, {q1}, d6
vtbl.8 d7, {q1}, d7
eor r8, r8, #48
vst1.8 {q3}, [r2]
.Lschedule_go:
cmp r1, #192
bgt .Lschedule_256
beq .Lschedule_192
.Lschedule_128:
mov r1, #10
.Loop_schedule_128:
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
b .Loop_schedule_128
.p2align 4
.Lschedule_192:
add r12, r0, #8
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
vmov d13, d1
veor d12, d12, d12
mov r1, #4
.Loop_schedule_192:
bl vpaes_schedule_round
vext.8 q0, q6, q0, #8
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
bl vpaes_schedule_mangle
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
b .Loop_schedule_192
.p2align 4
.Lschedule_256:
add r12, r0, #16
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
mov r1, #7
.Loop_schedule_256:
bl vpaes_schedule_mangle
vmov q6, q0
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
vdup.32 q0, d1[1]
vmov q5, q7
vmov q7, q6
bl vpaes_schedule_low_round
vmov q7, q5
b .Loop_schedule_256
.p2align 4
.Lk_opt:
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.p2align 4
.Lschedule_mangle_last:
adr r11, .Lk_deskew
cmp r3, #0
bne .Lschedule_mangle_last_dec
add r12, r8, r10
vld1.8 {q1}, [r12]
adr r11, .Lk_opt
vtbl.8 d2, {q0}, d2
vtbl.8 d3, {q0}, d3
vmov q0, q1
add r2, r2, #32
.Lschedule_mangle_last_dec:
adr r12, .Lk_s63
sub r2, r2, #16
vld1.8 {q1}, [r12]
veor q0, q0, q1
bl vpaes_schedule_transform
vst1.8 {q0}, [r2]
restore_all_neon
eor r0, r0, r0
vpaes_pop
.p2align 4
vpaes_schedule_192_smear:
vdup.32 q1, d12[0]
vdup.32 q0, d15[1]
vmov s7, s26
vmov s0, s30
veor q6, q6, q1
veor q6, q6, q0
vmov q0, q6
veor d12, d12, d12
bx lr
.p2align 4
vpaes_schedule_round:
veor q1, q1, q1
vext.8 q1, q8, q1, #15
vext.8 q8, q8, q8, #15
veor q7, q7, q1
vdup.32 q0, d1[1]
vext.8 q0, q0, q0, #1
vpaes_schedule_low_round:
veor q1, q1, q1
adr r12, .Lk_s63
vext.8 q1, q1, q7, #12
veor q2, q2, q2
veor q7, q7, q1
vld1.8 {q1}, [r12]
vext.8 q2, q2, q7, #8
veor q7, q7, q1
veor q7, q7, q2
vbic q1, q0, q9
vshr.u32 q1, q1, #4
vand q0, q0, q9
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
veor q3, q3, q1
vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q0, q0, q4
veor q0, q0, q7
vmov q7, q0
bx lr
.p2align 4
vpaes_schedule_transform:
vbic q1, q0, q9
vldmia r11, {q4-q5}
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d0, {q4}, d0
vtbl.8 d1, {q4}, d1
vtbl.8 d2, {q5}, d2
vtbl.8 d3, {q5}, d3
veor q0, q0, q1
bx lr
.p2align 4
.Lk_mc_forward2:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.Lk_s63:
.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
.Lk_dksd:
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
.p2align 4
vpaes_schedule_mangle:
vstmdb sp!, {q6-q7}
adr r12, .Lk_mc_forward2
vmov q4, q0
cmp r3, #0
vldmia r12!, {q5-q6} // q5 = Lk_mc_forward2, q6 = Lk_s63
bne .Lschedule_mangle_dec
add r2, r2, #16
veor q4, q4, q6
vtbl.8 d6, {q4}, d10
vtbl.8 d7, {q4}, d11
vtbl.8 d8, {q3}, d10
vtbl.8 d9, {q3}, d11
vtbl.8 d2, {q4}, d10
vtbl.8 d3, {q4}, d11
veor q3, q3, q4
veor q3, q3, q1
b .Lschedule_mangle_both
.p2align 4
.Lschedule_mangle_dec:
vbic q1, q4, q9
vldmia r12!, {q6-q7}
vshr.u32 q1, q1, #4
vand q4, q4, q9
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
veor q3, q3, q2
sub r2, r2, #16
.Lschedule_mangle_both:
add r12, r10, r8
vld1.8 {q1}, [r12]
sub r8, r8, #16
vtbl.8 d4, {q3}, d2
vtbl.8 d5, {q3}, d3
and r8, r8, #48
vst1.8 {q2}, [r2]
vldmia sp!, {q6-q7}
bx lr
/*
int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, void *key);
*/
#define userKey r0
#define AES_bits r1
#define key r2
#define t r12
.globl _AccelerateCrypto_vpaes_set_encrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_encrypt_key:
// 128/192/256 divided by 32 = 4/6/8; plus 5 = 9/11/13
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
add t, r11, #5
mov r3, #0
str t, [key, #240]
mov r8, #48
b vpaes_schedule_core
.globl _AccelerateCrypto_vpaes_set_decrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_decrypt_key:
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
mov r8, #32
add t, r11, #5
and r8, r8, AES_bits, lsr #1
mov r3, #1
str t, [key, #240]
add key, key, #16
eor r8, r8, #32
add key, key, t, lsl #4
b vpaes_schedule_core
/*
void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
*/
#define in r0
#define out r1
#define key r2
.globl _AccelerateCrypto_vpaes_encrypt
.p2align 4
_AccelerateCrypto_vpaes_encrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_encrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0 // return 0 for SUCCESS
vpaes_pop
.globl _AccelerateCrypto_vpaes_decrypt
.p2align 4
_AccelerateCrypto_vpaes_decrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_decrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0 // return 0 for SUCCESS
vpaes_pop
.p2align 4
vpaes_preheat:
adr r12, .Lk_s0F
vldmia r12, {q9-q15}
bx lr
.p2align 6
// the following 7 16-byte words are loaded into q9-q15 by vpaes_preheat
.Lk_s0F:
.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
.Lk_inv:
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_sb1:
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
#endif // !defined(__arm64__) && defined(__ARM_NEON__)

View File

@ -0,0 +1,65 @@
# Copyright (c) (2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// per block
#define in x0
#define out x1
#define key x2
#define keylen x3
#define t x5
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_AES_decrypt
_AccelerateCrypto_AES_decrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub sp, sp, #3*16
st1.4s {v0,v1,v2}, [sp]
#endif
ldr w3, [key, #240] // keylength = 32-bit
ldr q0, [in] // plain data
mov t, keylen
ldr q1, [key, t] // expanded key
sub t, t, #16
ldr q2, [key] // expanded key
0:
AESD 0, 1
AESIMC 0, 0
ldr q1, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 1
eor.16b v0, v0, v2
str q0, [out]
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2}, [sp], #48
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef keylen
#endif
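The same round structure, expressed with ARMv8 Crypto intrinsics instead of the raw AESD/AESIMC macros, looks roughly like the sketch below (illustrative only, not the routine above; it assumes drk[] already holds the decryption round keys in the order they are consumed, and compilation with the crypto extension enabled, e.g. -march=armv8-a+aes):
#include <arm_neon.h>
// Nr-1 rounds of AESD+AESIMC, one final AESD, then XOR with the last round key.
static uint8x16_t aes_decrypt_block_sketch(uint8x16_t block,
                                           const uint8x16_t *drk, int nr)
{
    for (int i = 0; i < nr - 1; i++) {
        block = vaesimcq_u8(vaesdq_u8(block, drk[i]));
    }
    block = vaesdq_u8(block, drk[nr - 1]);
    return veorq_u8(block, drk[nr]);
}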

View File

@ -0,0 +1,114 @@
# Copyright (c) (2011-2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// ecb mode
#define key x0
#define nblocks w1
#define in x2
#define out x3
#define keylen x4
#define t x5
.subsections_via_symbols
.text
.globl _AccelerateCrypto_ecb_AES_decrypt
.p2align 4
_AccelerateCrypto_ecb_AES_decrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub x4, sp, #6*16
sub sp, sp, #6*16
st1.4s {v0,v1,v2,v3}, [x4], #4*16
st1.4s {v4,v5}, [x4], #2*16
#endif
ldr w4, [key, #240] // keylength = 32-bit
ldr q5, [key] // expanded key
subs nblocks, nblocks, #4
b.lt L_lessthan4
L_4blocks:
mov t, keylen
ld1.4s {v0,v1,v2,v3}, [in], #4*16
ldr q4, [key, t] // expanded key
sub t, t, #16
0:
AESD 0, 4
AESIMC 0, 0
AESD 1, 4
AESIMC 1, 1
AESD 2, 4
AESIMC 2, 2
AESD 3, 4
AESIMC 3, 3
ldr q4, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 4
eor.16b v0, v0, v5
AESD 1, 4
eor.16b v1, v1, v5
AESD 2, 4
eor.16b v2, v2, v5
AESD 3, 4
eor.16b v3, v3, v5
st1.4s {v0,v1,v2,v3}, [out], #4*16
subs nblocks, nblocks, #4
b.ge L_4blocks
L_lessthan4:
ands nblocks, nblocks, #3
b.eq 9f
L_1block:
mov t, keylen
ldr q0, [in], #16 // plain data
ldr q4, [key, t] // expanded key
sub t, t, #16
0:
AESD 0, 4
AESIMC 0, 0
ldr q4, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 4
eor.16b v0, v0, v5
str q0, [out], #16
subs nblocks, nblocks, #1
b.gt L_1block
9:
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2,v3}, [sp], #4*16
ld1.4s {v4,v5}, [sp], #2*16
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef nblocks
#undef keylen
#endif

View File

@ -0,0 +1,66 @@
# Copyright (c) (2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// per block implementation
#define in x0
#define out x1
#define key x2
#define keylen x3
#define t x5
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_AES_encrypt
_AccelerateCrypto_AES_encrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub sp, sp, #3*16
st1.4s {v0,v1,v2}, [sp]
#endif
ldr w3, [key, #240] // keylength = 32-bit, 160/192/224
ldr q0, [in] // plain data
ldr q1, [key] // expanded key
ldr q2, [key, keylen] // final expanded key
mov t, #16
0:
AESE 0, 1
AESMC 0, 0
ldr q1, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b
AESE 0, 1
eor.16b v0, v0, v2
str q0, [out]
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2}, [sp], #48
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef keylen
#endif
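For comparison, the encrypt path above maps onto ARMv8 Crypto intrinsics roughly as follows (again an illustrative sketch, with rk[] holding the expanded encryption round keys in order and the crypto extension enabled at compile time):
#include <arm_neon.h>
// Nr-1 rounds of AESE+AESMC, one final AESE, then XOR with the last round key.
static uint8x16_t aes_encrypt_block_sketch(uint8x16_t block,
                                           const uint8x16_t *rk, int nr)
{
    for (int i = 0; i < nr - 1; i++) {
        block = vaesmcq_u8(vaeseq_u8(block, rk[i]));
    }
    block = vaeseq_u8(block, rk[nr - 1]);
    return veorq_u8(block, rk[nr]);
}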

View File

@ -0,0 +1,119 @@
# Copyright (c) (2011-2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
#define key x0
#define nblocks w1
#define in x2
#define out x3
#define keylen x4
#define t x5
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_ecb_AES_encrypt
_AccelerateCrypto_ecb_AES_encrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub x4, sp, #6*16
sub sp, sp, #6*16
st1.4s {v0,v1,v2,v3}, [x4], #4*16
st1.4s {v4,v5}, [x4], #2*16
#endif
ldr w4, [key, #240] // keylength = 32-bit, 160/192/224
subs nblocks, nblocks, #4 // pre-decrement nblocks by 4
ldr q5, [key, keylen] // expanded key
b.lt 1f // if nblocks < 4, go to scalar loop
L_4blocks:
// handle 4 blocks per iteration
ldr q4, [key] // expanded key
mov t, #16
ld1.4s {v0,v1,v2,v3}, [in], #4*16
0:
AESE 0, 4
AESMC 0, 0
AESE 1, 4
AESMC 1, 1
AESE 2, 4
AESMC 2, 2
AESE 3, 4
AESMC 3, 3
ldr q4, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b
AESE 0, 4
eor.16b v0, v0, v5
AESE 1, 4
eor.16b v1, v1, v5
AESE 2, 4
eor.16b v2, v2, v5
AESE 3, 4
eor.16b v3, v3, v5
st1.4s {v0,v1,v2,v3}, [out], #4*16
subs nblocks, nblocks, #4
b.ge L_4blocks
1: // handle 1 block per iteration
ands nblocks, nblocks, #3
b.eq 9f
L_1block:
ldr q4, [key] // expanded key
mov t, #16
ldr q0, [in], #16 // plain data
0:
AESE 0, 4
AESMC 0, 0
ldr q4, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b
AESE 0, 4
eor.16b v0, v0, v5
str q0, [out], #16
subs nblocks, nblocks, #1
b.gt L_1block
9:
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2,v3}, [sp], #4*16
ld1.4s {v4,v5}, [sp], #2*16
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef nblocks
#undef keylen
#endif
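The batched 4-block ECB entry points above exist only on arm64 (see the __arm64__ guards in the header); elsewhere ECB reduces to a loop over the documented per-block API. A minimal fallback sketch:
#include <stdint.h>
#include "AccelerateCrypto.h"
// Fallback ECB encryption: one AccelerateCrypto_AES_encrypt call per 16-byte
// block. Returns 0 on success, or the first nonzero per-block error.
static int ecb_encrypt_fallback(const AccelerateCrypto_AES_ctx *ctx,
                                uint32_t nblocks, const uint8_t *in, uint8_t *out)
{
    for (uint32_t i = 0; i < nblocks; i++) {
        int rc = AccelerateCrypto_AES_encrypt(in + 16u * i, out + 16u * i, ctx);
        if (rc != 0) {
            return rc;
        }
    }
    return 0;
}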

View File

@ -0,0 +1,25 @@
/* Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CONTEXT_H_
#define _CORECRYPTO_CONTEXT_H_
// Define byte offset of key within context structure.
#define ContextKey 0
/* Define byte offset of key length within context structure. The number
stored there is the number of bytes from the start of the first round key
to the start of the last round key. That is 16 less than the number of
bytes in the entire key.
*/
#define ContextKeyLength 240
#endif /* _CORECRYPTO_CONTEXT_H_ */
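A small worked example of the "key length" convention (consistent with the 160/192/224 checks in the armv7 routine earlier in this commit; the helper name is made up for illustration):
// AES-128: 10 rounds -> 11 round keys -> 11*16 = 176 bytes total -> 176-16 = 160
// AES-192: 12 rounds -> 13 round keys -> 13*16 = 208 bytes total -> 208-16 = 192
// AES-256: 14 rounds -> 15 round keys -> 15*16 = 240 bytes total -> 240-16 = 224
static inline unsigned aes_context_key_length(unsigned nrounds)
{
    return 16u * nrounds;   // value stored at byte offset ContextKeyLength (240)
}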

File diff suppressed because it is too large

View File

@ -0,0 +1,576 @@
# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__i386__) || defined(__x86_64__)
/* This file defines _vng_aes_encrypt or _vng_aes_decrypt, according to the value of
the Select preprocessor symbol. This file is designed to be included in
another assembly file using the preprocessor #include directive, to benefit
from some assembly-time calculations.
These two routines are nearly identical. They differ only in the tables
they use, the direction they iterate through the key, and the permutation
performed on part of the state.
Written by Eric Postpischil, January 2008.
*/
#if Select == 0
#define Name _aes_encrypt_nonaesni // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _aes_decrypt_nonaesni // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select
/* Routine:
_AESEncryptWithExpandedKey (if Select is 0) or
_AESDecryptWithExpandedKey (if Select is 1).
Function:
Perform the AES cipher or its inverse as defined in Federal Information
Processing Standards Publication 197 (FIPS-197), November 26, 2001.
The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.
Input:
Constant data:
The following names must be locally defined so the assembler
can calculate certain offsets.
For encryption:
static const Word _AESEncryptTable[4][256].
_AESEncryptTable[i] contains the tables T[i] defined in AES
Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
Vincent Rijmen, section 5.2.1, page 18. These tables
combine the SubBytes and MixColumns operations.
static const Word _AESSubBytesWordTable[4][256].
_AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
SubBytes is defined in FIPS-197. _AESSubBytesWordTable
differs from _AESEncryptTable in that it does not include
the MixColumn operation. It is used in performing the last
round, which differs from the previous rounds in that it
does not include the MixColumn operation.
For decryption:
static const Word _AESDecryptTable[4][256].
The analog of _AESEncryptTable for decryption.
static const Word _AESInvSubBytesWordTable[4][256].
_AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
where InvSubBytes is defined in FIPS-197.
_AESInvSubBytesWordTable differs from _AESDecryptTable in
that it does not include the InvMixColumn operation. It is
used in performing the last round, which differs from the
previous rounds in that it does not include the
InvMixColumn operation.
Arguments:
const Byte *InputText.
Address of input, 16 bytes. Best if four-byte aligned.
Byte *OutputText.
Address of output, 16 bytes. Best if four-byte aligned.
vng_aes_encrypt_ctx *Context or vng_aes_decrypt_ctx *Context
vng_aes_encrypt_ctx and vng_aes_decrypt_ctx are identical except the
former is used for encryption and the latter for decryption.
Each is a structure containing the expanded key beginning at
offset ContextKey and a four-byte "key length" beginning at
offset ContextKeyLength. The "key length" is the number of
bytes from the start of the first round key to the start of the
last round key. That is 16 less than the number of bytes in
the entire key.
Output:
Encrypted or decrypted data is written to *OutputText.
Return:
aes_rval // -1 if "key length" is invalid. 0 otherwise.
*/
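A rough C model of the calling contract documented above (a sketch; the context is treated as opaque here and the wrapper name is made up):
#include <stdint.h>
/* Entry point defined by this file when Select == 0; context kept opaque. */
extern int aes_encrypt_nonaesni(const void *in, void *out, const void *ctx);
static int encrypt_one_block(const uint8_t in[16], uint8_t out[16], const void *ctx)
{
    /* Returns 0 on success, or -1 when the "key length" stored in the
       context is not 160, 192 or 224 (i.e. not AES-128/192/256). */
    return aes_encrypt_nonaesni(in, out, ctx);
}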
.text
.globl Name
Name:
// Jimmur removed the capabilities check and the jump to the AESNI code. This
// will be handled by the C code.
// Push new stack frame.
push r5
/* Save registers and set SaveSize to the number of bytes pushed onto the
stack so far, including the caller's return address.
*/
push r3
#if defined __i386__
push r6
push r7
#define SaveSize (5*4)
#else
#define SaveSize (3*8)
#endif
/* Number of bytes used for local variables:
4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd.
5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers.
*/
#define LocalsSize (Arch(4, 0) + Arch(5, 3)*16)
#if 0 < LocalsSize
// Padding to position stack pointer at a multiple of 16 bytes.
#define Padding (15 & -(SaveSize + LocalsSize))
sub $Padding + LocalsSize, r4 // Allocate space on stack.
#else
#define Padding 0
#endif
#if BUILDKERNEL
// Save XMM registers.
movaps %xmm0, 0*16(r4)
movaps %xmm1, 1*16(r4)
movaps %xmm2, 2*16(r4)
#if defined __i386__
movaps %xmm3, 3*16(r4)
movaps %xmm4, 4*16(r4)
#endif
#endif // BUILDKERNEL
#if defined __i386__
// Number of bytes from caller's stack pointer to ours.
#define StackFrame (SaveSize + Padding + LocalsSize)
// Define location of argument i (presuming 4-byte arguments).
#define Argument(i) StackFrame+4*(i)(%esp)
#define ArgInputText Argument(0)
#define ArgOutputText Argument(1)
#define ArgContext Argument(2)
#elif defined __x86_64__
// Arguments.
#define InputText r7 // Used early then overwritten for other use.
#define OutputText r6 // Needed near end of routine.
#define ArgContext r2
/* The argument passed in r2 overlaps registers we need for other
work, so it must be moved early in the routine.
*/
#endif
#define BaseP Arch(r6, r9) // Base pointer for addressing global data.
#define ExpandedKey Arch(t0, r10) // Address of expanded key.
/* The Work registers defined below are used to hold parts of the AES state
while we dissect or assemble it. They must be assigned to the A, B, C, and
D registers so that we can access the bytes in %al, %ah, and so on.
*/
#define Work0d r0d
#define Work0l r0l
#define Work0h r0h
#define Work1d r3d
#define Work1l r3l
#define Work1h r3h
#define Work2d r1d
#define Work2l r1l
#define Work2h r1h
#define Work3d r2d
#define Work3l r2l
#define Work3h r2h
#define t0 r5
#define t0d r5d // Low 32 bits of t0.
#define t0l r5l // Low byte of t0.
#define t1 r7
/* S0, S1, S2, and S3 are where we assemble the new AES state when computing
a regular round. S1, S2, and S3 are assigned to the Work registers, but
S0 needs to go somewhere else because Work0 holds part of the old state.
*/
#define S0 Arch(t1, r8d)
#define S1 Work1d
#define S2 Work2d
#define S3 Work3d
/* These XMM registers are used as holding space, because it is faster to
spill to these registers than to the stack. (On x86_64, we do not need
to spill, because there are additional general registers available.
However, using more general registers requires saving them to the stack
and restoring them. I timed it, and no time was saved.)
*/
#define vS1 %xmm0
#define vS2 %xmm1
#define vS3 %xmm2
#if defined __i386__
#define vExpandedKey %xmm3
#define vIncrement %xmm4
#endif
// Get address of expanded key.
mov ArgContext, ExpandedKey
#if 0 != ContextKey
add $ContextKey, ExpandedKey
#endif
/* Store a sentinel value of ExpandedKey on the stack on i386, or in a register on
x86_64.
*/
#define ExpandedKeyEnd Arch(5*16(r4), r11)
// Get and check "key length".
movzb ContextKeyLength(ExpandedKey), r0
cmp $160, r0
je 2f
cmp $192, r0
je 2f
cmp $224, r0
je 2f
mov $-1, r0 // Return error.
jmp 9f
2:
#if (Select == 0 || Select == 2)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKey, r0
mov r0, ExpandedKeyEnd
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKey, ExpandedKeyEnd
add r0, ExpandedKey
#endif
// Initialize State from input text.
#if defined __i386__
mov ArgInputText, BaseP
#define InputText BaseP
#endif
mov 0*4(InputText), Work0d
mov 1*4(InputText), S1
mov 2*4(InputText), S2
mov 3*4(InputText), S3
#undef InputText // Register is reused after this for other purposes.
// Add round key and save results.
xor 0*4(ExpandedKey), Work0d // S0 is in dissection register.
xor 1*4(ExpandedKey), S1
movd S1, vS1 // Save S1 to S3 in vector registers.
xor 2*4(ExpandedKey), S2
movd S2, vS2
xor 3*4(ExpandedKey), S3
movd S3, vS3
add $Increment, ExpandedKey // Advance to next round key.
#if defined __i386__
// Save expanded key address and increment in vector registers.
mov $Increment, t1
movp ExpandedKey, vExpandedKey
movp t1, vIncrement
#endif
// Set up relative addressing.
#if defined __i386__
// Get address of 0 in BaseP.
call 0f // Push program counter onto stack.
0:
pop BaseP // Get program counter.
// Define macros to help address data.
#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4)
#elif defined __x86_64__
lea MTable(%rip), BaseP
// Define macros to help address data.
#define LookupM(table, index) (table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) (table)*TableSize(BaseP, index, 4)
/* With these definitions of LookupM and LookupF, BaseP must be loaded with
the address of the table at the point where it is used. So we need an
instruction to change BaseP after we are done with MTable and before we
start using FTable. I would prefer to use something like:
.set FMinusM, FTable - MTable
#define LookupF(table, index) \
FMinusM+(table)*TableSize(BaseP, index, 4)
Then BaseP would not need to change. However, this fails due to an
assembler/linker bug.
*/
#endif
// Get round key.
mov 0*4(ExpandedKey), S0
mov 1*4(ExpandedKey), S1
mov 2*4(ExpandedKey), S2
mov 3*4(ExpandedKey), S3
1:
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/
// Process previous S0.
movzb Work0l, t0
xor LookupM(0, t0), S0
movzb Work0h, t0d
xor LookupM(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S2
movzb Work0h, t0d
xor LookupM(3, t0), P1
// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S1
movzb Work0h, t0d
xor LookupM(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S3
movzb Work0h, t0d
xor LookupM(3, t0), P2
// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S2
movzb Work0h, t0d
xor LookupM(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S0
movzb Work0h, t0d
xor LookupM(3, t0), P3
// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S3
movzb Work0h, t0d
xor LookupM(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S1
movzb Work0h, t0d
xor LookupM(3, t0), P0
#if defined __i386__
paddd vIncrement, vExpandedKey
movp vExpandedKey, ExpandedKey
#else
add $Increment, ExpandedKey
#endif
// Save state for next iteration and load next round key.
mov S0, Work0d
mov 0*4(ExpandedKey), S0
movd S1, vS1
mov 1*4(ExpandedKey), S1
movd S2, vS2
mov 2*4(ExpandedKey), S2
movd S3, vS3
mov 3*4(ExpandedKey), S3
cmp ExpandedKeyEnd, ExpandedKey
jne 1b
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/
// Work around assembler bug. See comments above about Radar 5683882.
#if defined __x86_64__
lea FTable(%rip), BaseP
#endif
// Process previous S0.
movzb Work0l, t0
xor LookupF(0, t0), S0
movzb Work0h, t0d
xor LookupF(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S2
movzb Work0h, t0d
xor LookupF(3, t0), P1
// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S1
movzb Work0h, t0d
xor LookupF(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S3
movzb Work0h, t0d
xor LookupF(3, t0), P2
// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S2
movzb Work0h, t0d
xor LookupF(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S0
movzb Work0h, t0d
xor LookupF(3, t0), P3
// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S3
movzb Work0h, t0d
xor LookupF(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S1
movzb Work0h, t0d
xor LookupF(3, t0), P0
#if defined __i386__ // Architecture.
// Get OutputText address.
#define OutputText BaseP
mov ArgOutputText, OutputText
#endif // Architecture.
// Write output.
mov S0, 0*4(OutputText)
mov S1, 1*4(OutputText)
mov S2, 2*4(OutputText)
mov S3, 3*4(OutputText)
xor r0, r0 // Return success.
9:
// Pop stack and restore registers.
#if BUILDKERNEL
#if defined __i386__
movaps 4*16(r4), %xmm4
movaps 3*16(r4), %xmm3
#endif
movaps 2*16(r4), %xmm2
movaps 1*16(r4), %xmm1
movaps 0*16(r4), %xmm0
#endif // BUILDKERNEL
#if 0 < LocalsSize
add $Padding + LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
#endif
pop r3
pop r5
ret
#undef ArgExpandedKey
#undef ArgInputText
#undef ArgNr
#undef ArgOutputText
#undef Argument
#undef BaseP
#undef ExpandedKey
#undef ExpandedKeyEnd
#undef FTable
#undef InputText
#undef LocalsSize
#undef LookupM
#undef LookupF
#undef MTable
#undef OutputText
#undef Padding
#undef SaveSize
#undef S0
#undef S1
#undef S2
#undef S3
#undef StackFrame
#undef Work0d
#undef Work0h
#undef Work0l
#undef Work1d
#undef Work1h
#undef Work1l
#undef Work2d
#undef Work2h
#undef Work2l
#undef Work3d
#undef Work3h
#undef Work3l
#undef t0
#undef t0d
#undef t0l
#undef t1
#undef vExpandedKey
#undef vS1
#undef vS2
#undef vS3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment
#endif // defined(__x86_64__) || defined(__i386__)

View File

@ -0,0 +1,38 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#if (defined(__x86_64__) || defined(__i386__))
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
extern int aes_encrypt_aesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_decrypt_aesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_encrypt_nonaesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_decrypt_nonaesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
int AccelerateCrypto_AES_encrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key)
{
if (HAS_AESNI()) return aes_encrypt_aesni(in, out, key);
else
return aes_encrypt_nonaesni(in, out, key);
}
int AccelerateCrypto_AES_decrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key)
{
if (HAS_AESNI()) return aes_decrypt_aesni(in, out, key);
else
return aes_decrypt_nonaesni(in, out, key);
}
#endif // (defined(__x86_64__) || defined(__i386__))
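HAS_AESNI() is provided by config.h, which is not shown in this view; as a hedged sketch, a runtime check of this kind typically reads CPUID leaf 1 and tests the AES-NI feature flag (ECX bit 25):
#include <cpuid.h>
/* Illustrative only -- not the config.h implementation. */
static int has_aesni_sketch(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    return (ecx & (1u << 25)) != 0;   /* CPUID.1:ECX bit 25 = AES-NI */
}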

View File

@ -0,0 +1,483 @@
# Copyright (c) (2012,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/* This file defines _aes_encrypt_aesni and _aes_decrypt_aesni --- Intel Westmere HW AES-based implementation
of _aes_encrypt and _aes_decrypt.
These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.
The AES HW is detected 1st thing in
_aes_encrypt (EncryptDecrypt.s)
_aes_decrypt (EncryptDecrypt.s)
and, if AES HW is detected, branch without link (ie, jump) to the functions here.
The implementation here follows the examples in an Intel White Paper
"Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01
Note: Rev. 03 Final (2010 01 26) is available; it appears to include some code changes relative to Rev.2 01.
*/
#if (defined __i386__ || defined __x86_64__)
.text
.p2align 4,0x90
.globl _aes_encrypt_aesni
_aes_encrypt_aesni:
#if defined __i386__
movl 4(%esp), %eax // in
movl 12(%esp), %edx // ctx
movl 8(%esp), %ecx // out
#define LOCAL_SIZE (12+16+16) // 16-byte align (-4 for return address) + 16 (xmm0) + 16 (xmm1)
#define in %eax
#define ctx %edx
#define out %ecx
#define r13 %esp
#else // x86_64
#define LOCAL_SIZE (8+16+16) // 16-byte align (-8 for return address) + 16 (xmm0) + 16 (xmm1)
#define in %rdi
#define ctx %rdx
#define out %rsi
#define r13 %rsp
#endif // i386 or x86_64
#if BUILDKERNEL
sub $LOCAL_SIZE, r13
movaps %xmm0, (r13)
#endif
movups (in), %xmm0
// key length identification
movl 240(ctx), %eax // key length
cmp $160, %eax
je L_AES_128
cmp $192, %eax
je L_AES_192
cmp $224, %eax
je L_AES_256
mov $-1, %eax // return ERROR
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
L_AES_128:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenclast 160(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
L_AES_192:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenc 160(ctx), %xmm0
aesenc 176(ctx), %xmm0
aesenclast 192(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 176(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 192(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
L_AES_256:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenc 160(ctx), %xmm0
aesenc 176(ctx), %xmm0
aesenc 192(ctx), %xmm0
aesenc 208(ctx), %xmm0
aesenclast 224(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 176(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 192(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 208(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 224(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
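For readers more used to intrinsics, the aligned AES-128 path above corresponds roughly to the C sketch below (illustrative only; it assumes an expanded key of 11 round keys, i.e. the 176-byte layout implied by Context.h, and requires building with AES-NI enabled):
#include <stdint.h>
#include <wmmintrin.h>   /* AES-NI intrinsics; compile with -maes */
static void aes128_encrypt_block_sketch(const uint8_t in[16], uint8_t out[16],
                                        const __m128i rk[11])
{
    __m128i s = _mm_loadu_si128((const __m128i *)in);
    s = _mm_xor_si128(s, rk[0]);             /* pxor (ctx), %xmm0        */
    for (int r = 1; r < 10; r++)
        s = _mm_aesenc_si128(s, rk[r]);      /* aesenc 16*r(ctx), %xmm0  */
    s = _mm_aesenclast_si128(s, rk[10]);     /* aesenclast 160(ctx)      */
    _mm_storeu_si128((__m128i *)out, s);
}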
.text
.p2align 4,0x90
.globl _aes_decrypt_aesni
_aes_decrypt_aesni:
#if defined __i386__
movl 4(%esp), %eax // in
movl 12(%esp), %edx // ctx
movl 8(%esp), %ecx // out
#endif
#if BUILDKERNEL
sub $LOCAL_SIZE, r13
movaps %xmm0, (r13)
#endif
movups (in), %xmm0
// key length identification
movl 240(ctx), %eax // key length
cmp $160, %eax
je 0f // AES-128
cmp $192, %eax
je 1f // AES-192
cmp $224, %eax
je 2f // AES-256
mov $-1, %eax // return ERROR
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // AES-128
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-128 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 160(ctx), %xmm1
pxor %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
1: // AES-192
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor 192(ctx), %xmm0
aesdec 176(ctx), %xmm0
aesdec 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-192 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 192(ctx), %xmm1
pxor %xmm1, %xmm0
movups 176(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 160(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
2: // AES-256
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor 224(ctx), %xmm0
aesdec 208(ctx), %xmm0
aesdec 192(ctx), %xmm0
aesdec 176(ctx), %xmm0
aesdec 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-256 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 224(ctx), %xmm1
pxor %xmm1, %xmm0
movups 208(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 192(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 176(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 160(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
#endif /* x86 based build */

View File

@ -0,0 +1,146 @@
# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/* AES.s -- Core AES routines for Intel processors.
Written by Eric Postpischil, January 30, 2008.
*/
#if (defined __i386__ || defined __x86_64__)
/* We build these AES routines as a single module because the routines refer
to labels in Data.s and it is easier and faster to refer to them as local
labels. In my implementations of AES for CommonCrypto, both i386 and
x86_64 use position-independent code. For this in-kernel implementation,
i386 has been converted to absolute addressing, but x86_64 still uses PIC.
A local label can be referred to with position-independent assembler
expressions such as "label-base(register)", where <base> is a local label
whose address has been loaded into <register>. (On i386, this is typically
done with the idiom of a call to the next instruction and a pop of that
return address into a register.) Without local labels, the references must
be done using spaces for addresses of "lazy symbols" that are filled in by
the dynamic loader and loaded by the code that wants the address.
So the various routines in other files are assembled here via #include
directives.
*/
#include "Data.s"
#define TableSize (256*4)
/* Each of the arrays defined in Data.s except for the round constants
in _AESRcon is composed of four tables of 256 entries of four bytes
each. TableSize is the number of bytes in one of those four tables.
*/
// Include constants describing the AES context structures.
#include "Context.h"
/* Define a macro to select a value based on architecture. This reduces
some of the architecture conditionalization later in the source.
*/
#if defined __i386__
#define Arch(i386, x86_64) i386
#elif defined __x86_64__
#define Arch(i386, x86_64) x86_64
#endif
// Define an instruction for moving pointers.
#define movp Arch(movd, movd)
// Latter argument should be "movq", but the assembler uses "movd".
/* Rename the general registers. This makes it easier to keep track of them
and provides names for the "whole register" that are uniform between i386
and x86_64.
*/
#if defined __i386__
#define r0 %eax // Available for any use.
#define r1 %ecx // Available for any use, some special purposes (loop).
#define r2 %edx // Available for any use.
#define r3 %ebx // Must be preserved by called routine.
#define r4 %esp // Stack pointer.
#define r5 %ebp // Frame pointer, must preserve, no bare indirect.
#define r6 %esi // Must be preserved by called routine.
#define r7 %edi // Must be preserved by called routine.
#elif defined __x86_64__
#define r0 %rax // Available for any use.
#define r1 %rcx // Available for any use.
#define r2 %rdx // Available for any use.
#define r3 %rbx // Must be preserved by called routine.
#define r4 %rsp // Stack pointer.
#define r5 %rbp // Frame pointer. Must be preserved by called routine.
#define r6 %rsi // Available for any use.
#define r7 %rdi // Available for any use.
#define r8 %r8 // Available for any use.
#define r9 %r9 // Available for any use.
#define r10 %r10 // Available for any use.
#define r11 %r11 // Available for any use.
#define r12 %r12 // Must be preserved by called routine.
#define r13 %r13 // Must be preserved by called routine.
#define r14 %r14 // Must be preserved by called routine.
#define r15 %r15 // Must be preserved by called routine.
#else
#error "Unknown architecture."
#endif
// Define names for parts of registers.
#define r0d %eax // Low 32 bits of r0.
#define r1d %ecx // Low 32 bits of r1.
#define r2d %edx // Low 32 bits of r2.
#define r3d %ebx // Low 32 bits of r3.
#define r5d %ebp // Low 32 bits of r5.
#define r6d %esi // Low 32 bits of r6.
#define r7d %edi // Low 32 bits of r7.
#define r8d %r8d // Low 32 bits of r8.
#define r9d %r9d // Low 32 bits of r9.
#define r11d %r11d // Low 32 bits of r11.
#define r0l %al // Low byte of r0.
#define r1l %cl // Low byte of r1.
#define r2l %dl // Low byte of r2.
#define r3l %bl // Low byte of r3.
#define r5l %bpl // Low byte of r5.
#define r0h %ah // Second lowest byte of r0.
#define r1h %ch // Second lowest byte of r1.
#define r2h %dh // Second lowest byte of r2.
#define r3h %bh // Second lowest byte of r3.
.text
// Define encryption routine, _AESEncryptWithExpandedKey
#define Select 0
#include "EncryptDecrypt.s"
#undef Select
// Define decryption routine, _AESDecryptWithExpandedKey
#define Select 1
#include "EncryptDecrypt.s"
#undef Select
// Define key expansion routine for encryption, _AESExpandKeyForEncryption.
// #include "ExpandKeyForEncryption.s"
// Define key expansion for decryption routine, _AESExpandKeyForDecryption.
// #include "ExpandKeyForDecryption.s"
#endif /* x86 based build */

File diff suppressed because it is too large

View File

@ -0,0 +1,362 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0]; B[0] = HASH[1]; C[0] = HASH[2]; D[0] = HASH[3]; E[0] = HASH[4];
for ( i=0; i<80; ++i ) {
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80]; HASH[1] += B[80]; HASH[2] += C[80]; HASH[3] += D[80]; HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i].
For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. update 4 consecutive W[i] (stored in a single 16-byte register)
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in a Q register, we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed,
W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2)
the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0
(with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. once W-vector is computed, W+K is then computed and saved in the stack memory, this will be used later when
updating the digests A/B/C/D/E
the execution flow (for 1 single 64-byte block) looks like
W_PRECALC_00_15 // big-endian loading of 64-bytes into 4 W-vectors, compute WK=W+K, save WK in the stack memory
W_PRECALC_16_31 // for each vector, update digests, update W (Gaudet) and WK=W+K, save WK in the stack memory
W_PRECALC_32_79 // for each vector, update digests, update W (Intel) and WK=W+K, save WK in the stack memory
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into 4 Q registers
pre_calculate and store WK = W+K(0:15) in 16-byte aligned stack memory
L_loop:
load digests a-e from ctx->state;
for (r=0;r<16;r+=4) {
digests a-e update and permute round r:r+3
update W([r:r+3]%16) (Gaudet) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=16;r<64;r+=4) {
digests a-e update and permute round r:r+3
update W([r:r+3]%16) (Intel) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=4) {
digests a-e update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into 4 Q registers
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-e;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=4) {
digests a-e update and permute round r:r+3
}
ctx->states += digests a-e;
----------------------------------------------------------------------------------------------------------
*/
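A scalar C model of the Gaudet update described above may make the lane-3 fix-up easier to follow (a sketch; the helper name is invented):
#include <stdint.h>
static inline uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
/* One 4-lane W update for rounds 16..31 as used above (i a multiple of 4). */
static void w_update_gaudet(uint32_t W[80], int i)
{
    W[i]     = rol32(W[i-3] ^ W[i-8]  ^ W[i-14] ^ W[i-16], 1);
    W[i + 1] = rol32(W[i-2] ^ W[i-7]  ^ W[i-13] ^ W[i-15], 1);
    W[i + 2] = rol32(W[i-1] ^ W[i-6]  ^ W[i-12] ^ W[i-14], 1);
    /* lane 3 would need W[i], which is being computed in this very vector,
       so substitute 0 here ...                                            */
    W[i + 3] = rol32(0      ^ W[i-5]  ^ W[i-11] ^ W[i-13], 1);
    /* ... and patch it afterwards with the freshly computed W[i] rol 1.   */
    W[i + 3] ^= rol32(W[i], 1);
}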
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
.subsections_via_symbols
.text
.p2align 4
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
.p2align 4
.globl _AccelerateCrypto_SHA1_compress
_AccelerateCrypto_SHA1_compress:
#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3
BRANCH_TARGET_CALL
#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
// early exit if input number of blocks is zero
adrp ktable, K_XMM_AR@page
cbnz numblocks, 1f
ret lr
1:
add ktable, ktable, K_XMM_AR@pageoff // K table
#if BUILDKERNEL
// save vector registers that will be used in the computation: v0-v7, v16-v24
sub x4, sp, #17*16
sub sp, sp, #17*16
st1.4s {v0,v1,v2,v3}, [x4], #64
st1.4s {v4,v5,v6,v7}, [x4], #64
st1.4s {v16,v17,v18,v19}, [x4], #64
st1.4s {v20,v21,v22,v23}, [x4], #64
st1.4s {v24}, [x4], #16
#endif
ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
ld1.4s {v21,v22,v23,v24}, [ktable], #64 // k1,k2,k3,k4
ldr q16, [hashes], #16
ldr s17, [hashes], #-16
rev32.16b v0, v0 // byte swap of 1st 4 ints
rev32.16b v1, v1 // byte swap of 2nd 4 ints
rev32.16b v2, v2 // byte swap of 3rd 4 ints
rev32.16b v3, v3 // byte swap of 4th 4 ints
mov.16b v18, v16
add.4s v4, v0, v21 // 1st 4 input + K256
add.4s v5, v1, v21 // 2nd 4 input + K256
mov.16b v19, v17
add.4s v6, v2, v21 // 3rd 4 input + K256
add.4s v7, v3, v21 // 4th 4 input + K256
.macro sha1c_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1C 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm
.macro sha1p_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1P 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm
.macro sha1m_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1M 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm
// 4 vector hashes update and load next vector rounds
.macro sha1p_hash_load_round
rev32.16b $1, $1
mov.16b v20, v18
SHA1P 18, 19, $0
SHA1H 19, 20
add.4s $2, $1, $3
.endm
.macro sha1p_hash_round
mov.16b v20, v18
SHA1P 18, 19, $0
SHA1H 19, 20
.endm
sha1c_round 0, 1, 2, 3, 4, v0, v4, v21
sha1c_round 1, 2, 3, 0, 5, v1, v5, v22
sha1c_round 2, 3, 0, 1, 6, v2, v6, v22
sha1c_round 3, 0, 1, 2, 7, v3, v7, v22
sha1c_round 0, 1, 2, 3, 4, v0, v4, v22
sha1p_round 1, 2, 3, 0, 5, v1, v5, v22
sha1p_round 2, 3, 0, 1, 6, v2, v6, v23
sha1p_round 3, 0, 1, 2, 7, v3, v7, v23
sha1p_round 0, 1, 2, 3, 4, v0, v4, v23
sha1p_round 1, 2, 3, 0, 5, v1, v5, v23
sha1m_round 2, 3, 0, 1, 6, v2, v6, v23
sha1m_round 3, 0, 1, 2, 7, v3, v7, v24
sha1m_round 0, 1, 2, 3, 4, v0, v4, v24
sha1m_round 1, 2, 3, 0, 5, v1, v5, v24
sha1m_round 2, 3, 0, 1, 6, v2, v6, v24
sha1p_round 3, 0, 1, 2, 7, v3, v7, v24
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.le L_wrapup
L_loop:
ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
sha1p_hash_load_round 4, v0, v4, v21
sha1p_hash_load_round 5, v1, v5, v21
sha1p_hash_load_round 6, v2, v6, v21
sha1p_hash_load_round 7, v3, v7, v21
add.4s v18, v16, v18
add.4s v19, v17, v19
mov.16b v16, v18
mov.16b v17, v19
sha1c_round 0, 1, 2, 3, 4, v0, v4, v21
sha1c_round 1, 2, 3, 0, 5, v1, v5, v22
sha1c_round 2, 3, 0, 1, 6, v2, v6, v22
sha1c_round 3, 0, 1, 2, 7, v3, v7, v22
sha1c_round 0, 1, 2, 3, 4, v0, v4, v22
sha1p_round 1, 2, 3, 0, 5, v1, v5, v22
sha1p_round 2, 3, 0, 1, 6, v2, v6, v23
sha1p_round 3, 0, 1, 2, 7, v3, v7, v23
sha1p_round 0, 1, 2, 3, 4, v0, v4, v23
sha1p_round 1, 2, 3, 0, 5, v1, v5, v23
sha1m_round 2, 3, 0, 1, 6, v2, v6, v23
sha1m_round 3, 0, 1, 2, 7, v3, v7, v24
sha1m_round 0, 1, 2, 3, 4, v0, v4, v24
sha1m_round 1, 2, 3, 0, 5, v1, v5, v24
sha1m_round 2, 3, 0, 1, 6, v2, v6, v24
sha1p_round 3, 0, 1, 2, 7, v3, v7, v24
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.gt L_loop
L_wrapup:
sha1p_hash_round 4
sha1p_hash_round 5
sha1p_hash_round 6
sha1p_hash_round 7
add.4s v16, v16, v18
add.4s v17, v17, v19
str q16,[hashes], #16
str s17,[hashes]
#if BUILDKERNEL
// restore vector registers that were clobbered in the computation: v0-v7, v16-v24
ld1.4s {v0,v1,v2,v3}, [sp], #64
ld1.4s {v4,v5,v6,v7}, [sp], #64
ld1.4s {v16,v17,v18,v19}, [sp], #64
ld1.4s {v20,v21,v22,v23}, [sp], #64
ld1.4s {v24}, [sp], #16
#endif
ret lr
#endif // define(__arm64__)

View File

@ -0,0 +1,30 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
#if (defined(__x86_64__) || defined(__i386__))
extern void AccelerateCrypto_SHA1_compress_ssse3(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_ssse3");
extern void AccelerateCrypto_SHA1_compress_AVX1(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_AVX1");
extern void AccelerateCrypto_SHA1_compress_AVX2(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_AVX2");
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t num, const void *buf)
{
#if defined(__x86_64__)
if (HAS_AVX2()) AccelerateCrypto_SHA1_compress_AVX2(state, num, buf);
else if (HAS_AVX1()) AccelerateCrypto_SHA1_compress_AVX1(state, num, buf);
else
#endif
AccelerateCrypto_SHA1_compress_ssse3(state, num, buf);
}
#endif // (defined(__x86_64__) || defined(__i386__))
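A hypothetical caller of the dispatcher above, compressing a single 64-byte block starting from the standard SHA-1 initial state (sketch only):
#include <stddef.h>
#include <stdint.h>
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t num, const void *buf);
static void sha1_compress_one_block_example(const uint8_t block[64])
{
    uint32_t state[5] = { 0x67452301u, 0xefcdab89u, 0x98badcfeu,
                          0x10325476u, 0xc3d2e1f0u };
    AccelerateCrypto_SHA1_compress(state, 1, block);   /* num counts 64-byte blocks */
    (void)state;   /* state[] now holds the chaining value after this block */
}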

View File

@ -0,0 +1,785 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
#if defined(__x86_64__)
/* vng_sha1LittleEndian.s : this file provides optimized x86_64 avx1 implementation of the sha1 function
CoreOS - vector and numerics group
The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
Update HASH[] by processing a one 64-byte block in MESSAGE[] can be represented by the following C function
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0];
B[0] = HASH[1];
C[0] = HASH[2];
D[0] = HASH[3];
E[0] = HASH[4];
for ( i=0; i<80; ++i )
{
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80];
HASH[1] += B[80];
HASH[2] += C[80];
HASH[3] += D[80];
HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. done on 4 consecutive W[i] values in a single XMM register
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
*/
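For rounds 32 and up, the identity noted above ((X rol 1) rol 1 == X rol 2) collapses the update into a single rotate-by-2 over taps that are at least one full vector old; a scalar sketch (hypothetical helper, not part of this file):
#include <stdint.h>
static inline uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
/* One 4-lane W update for rounds 32..79 (i a multiple of 4, 32 <= i <= 76). */
static void w_update_intel(uint32_t W[80], int i)
{
    for (int j = 0; j < 4; j++)
        W[i + j] = rol32(W[i + j - 6] ^ W[i + j - 16] ^ W[i + j - 28] ^ W[i + j - 32], 2);
}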
/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_blocks to 0 */
#define Multiple_Blocks 1
#if defined (__x86_64__)
#if BUILDKERNEL
#define stack_size (32*10+16*4+16) // ymm0-9 + 4 128-bits for intermediate WK(t) storage + 32-byte alignment
#else
#define stack_size (16*4) // 4 128-bits for intermediate WK(t) storage
#endif
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with avx1 support
#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP REV32(%rip)
#define xmov vmovaps // aligned 16-byte move
#define xmovu vmovups // unaligned 16-byte move
// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#define D %r15d
#define E %edx
// temp variables
#define T1 %eax
#define T2 %ebx
#define WK(t) ((t)&15)*4(sp)
// int F1(int B, int C, int D) { return D ^ (B & (C ^ D)); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
// for i=60:79, F4 is identical to F2
#define F4 F2
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
for (i=0;i<16;i+=4) {
1. W_TMP = new 16 bytes from MESSAGE[]
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
3. WTMP += {K,K,K,K};
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}
each step is represented in one of the following 4 macro definitions
*/
.macro W_PRECALC_00_15_0 arg0 // input argument $0 : 0/4/8/12
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
.endm
.macro W_PRECALC_00_15_1 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
vpshufb XMM_SHUFB_BSWAP, W_TMP, \arg0 // convert W_TMP from little-endian into big-endian
.endm
.macro W_PRECALC_00_15_2 arg0 // K_BASE points to the current K quadruple.
vpaddd (K_BASE), \arg0, W_TMP // W_TMP += {K,K,K,K};
.endm
.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
.endm
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1, if we are taking from the initial W before rol 1, we should rol this by 2
The operation (updating W and W+K) is scheduled as and divided into 4 steps
0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K;
*/
.macro W_PRECALC_16_31_0 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
vpalignr $8, \arg0, \arg1, \arg4 // W = W14
vpsrldq $4, \arg3, W_TMP // W_TMP = W3
vpxor \arg2, \arg4, \arg4 // W = W8 ^ W14
.endm
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, W_TMP, W_TMP // W_TMP = W3 ^ W16
vpxor W_TMP, \arg1, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
vpslldq $12, \arg1, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm
.macro W_PRECALC_16_31_2 arg0 // input argument : W
vpslld $1, \arg0, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
vpsrld $31, \arg0, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
vpor \arg0, W_TMP, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
vpslld $2, W_TMP2, \arg0 // W = W[i] higher 30 bits after rol 2
vpsrld $30, W_TMP2, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
.endm
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
vpxor W_TMP, \arg0, \arg0
vpxor W_TMP2, \arg0, \arg0 // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
vpaddd \arg2(K_BASE), \arg0, W_TMP // W+K
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
.endm
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_Tmp; WK = W_tmp + K;
*/
.macro W_PRECALC_32_79_0 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
vpxor \arg0, \arg3, \arg3 // W = W28 ^ W32;
vpalignr $8, \arg1, \arg2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, \arg1, \arg1 // W_tmp = W6 ^ W16
vpxor W_TMP, \arg1, \arg1 // W_tmp = W6 ^ W16 ^ W28 ^ W32
//xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm
.macro W_PRECALC_32_79_2 arg0 // input argument : W
vpslld $2, \arg0, W_TMP // W << 2
vpsrld $30, \arg0, \arg0 // W >> 30
vpor W_TMP, \arg0, \arg0 // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
vpaddd \arg2(K_BASE), \arg0, W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
.endm
/* The hash update operation is completed by the following statements.
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
A1 = FN + E0 + rol(A0,5) + WK;
B1 = A0;
C1 = rol(B0, 30);
D1 = C0;
E1 = D0;
to avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.
Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C
These operations are distributed into the following 2 macro definitions RR0 and RR1.
*/
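A scalar sketch of what one RR0/RR1 pair computes (illustrative only; f stands for whichever of F1..F4 applies to the round, and wk[] holds the two precomputed W+K values read from the stack):
#include <stdint.h>
static inline uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
static void two_rounds(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
                       uint32_t (*f)(uint32_t, uint32_t, uint32_t),
                       const uint32_t wk[2])
{
    *e += f(*b, *c, *d) + rol32(*a, 5) + wk[0];   /* round t:   new value lands in E */
    *b  = rol32(*b, 30);
    /* the hashes are now logically ordered E,A,B,C,D */
    *d += f(*a, *b, *c) + rol32(*e, 5) + wk[1];   /* round t+1: new value lands in D */
    *a  = rol32(*a, 30);
    /* the hashes are now logically ordered D,E,A,B,C */
}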
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
rol $30, \arg2 // B = rol(B,30)
add WK(\arg6), \arg5 // E + WK(i)
mov \arg1, T2 // T2 = A
add WK(\arg6+1), \arg4 // D + WK(i+1)
rol $5, T2 // rol(A,5)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add \arg5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
mov T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rol $5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm
.macro INITIAL_W_PRECALC // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
.endm
.macro INTERNAL // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18
// starting using F2
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
W_PRECALC_32_79_2 W28
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
W_PRECALC_32_79_2 W24
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38
// starting using F3
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
W_PRECALC_32_79_2 W28
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
W_PRECALC_32_79_2 W24
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58
// starting using F4
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62
.endm
.macro SOFTWARE_PIPELINING
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0]
RR1 F4,A,B,C,D,E,70
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
.macro ENDING // finish up updating hash digests (i=64:79)
//i=80
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66
//i=84
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70
//i=88
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74
//i=92
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
.endm
.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
.macro UPDATE_ALL_HASH
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
.endm
/*
main sha1 code for system with avx1 support
*/
.macro SHA1_PIPELINED_MAIN_BODY
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subq _IMM(1), cnt // pre-decrement cnt by 1
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm
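/*
Added illustration (not part of the original source): the control flow that
SHA1_PIPELINED_MAIN_BODY expands to, written as a rough C sketch. The helper
names below are hypothetical stand-ins for the assembler macros of the same
names; only the loop structure is the point here.

    #include <stdint.h>
    #include <stddef.h>

    extern void load_hash(uint32_t state[5]);
    extern void initial_w_precalc(const uint8_t *block);
    extern void internal(void);
    extern void software_pipelining(const uint8_t *next_block);
    extern void ending(void);
    extern void update_all_hash(uint32_t state[5]);

    void sha1_blocks(uint32_t state[5], const uint8_t *buf, size_t nblocks) // nblocks >= 1
    {
        load_hash(state);
        initial_w_precalc(buf);              // W and W+K of the first block (i=0:15)
        for (;;) {
            internal();                      // schedule W (i=16:79), rounds 0..63
            buf += 64;
            if (--nblocks == 0) break;       // the "jbe 1f" exit
            software_pipelining(buf);        // rounds 64..79 of this block, interleaved
                                             // with the W/W+K precalc of the next block
            update_all_hash(state);
        }
        ending();                            // rounds 64..79 of the last block
        update_all_hash(state);
    }
*/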
/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembler code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_AVX1
_AccelerateCrypto_SHA1_compress_AVX1:
// start the sha1 code with avx1 support
// save callee-save registers
push %rbp
mov %rsp, %rbp
push %rbx
push %r15
sub $stack_size, sp // allocate stack memory for use
// save used xmm register if this is for kernel
#if BUILDKERNEL
andq $-32, sp // aligned sp to 32-bytes
leaq 4*16(sp), %rax
xmov %ymm0, 0*32(%rax)
xmov %ymm1, 1*32(%rax)
xmov %ymm2, 2*32(%rax)
xmov %ymm3, 3*32(%rax)
xmov %ymm4, 4*32(%rax)
xmov %ymm5, 5*32(%rax)
xmov %ymm6, 6*32(%rax)
xmov %ymm7, 7*32(%rax)
xmov %ymm8, 8*32(%rax)
xmov %ymm9, 9*32(%rax)
#endif
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE
SHA1_PIPELINED_MAIN_BODY
// restore used xmm registers if this is for kernel
#if BUILDKERNEL
leaq 4*16(sp), %rax
xmov 0*32(%rax), %ymm0
xmov 1*32(%rax), %ymm1
xmov 2*32(%rax), %ymm2
xmov 3*32(%rax), %ymm3
xmov 4*32(%rax), %ymm4
xmov 5*32(%rax), %ymm5
xmov 6*32(%rax), %ymm6
xmov 7*32(%rax), %ymm7
xmov 8*32(%rax), %ymm8
xmov 9*32(%rax), %ymm9
#endif
leaq -16(%rbp), %rsp
// restore callee-save registers
pop %r15
pop %rbx
pop %rbp
ret // return
CC_ASM_SECTION_CONST
.p2align 4, 0x90
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
REV32:
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // architecture x86_64
#endif // defined(__x86_64__)

View File

@ -0,0 +1,780 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
#if defined(__x86_64__)
/* vng_sha1LittleEndian.s : this file provides an optimized x86_64 avx2 implementation of the sha1 function
CoreOS - vector and numerics group
The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0];
B[0] = HASH[1];
C[0] = HASH[2];
D[0] = HASH[3];
E[0] = HASH[4];
for ( i=0; i<80; ++i )
{
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80];
HASH[1] += B[80];
HASH[2] += C[80];
HASH[3] += D[80];
HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. done on 4 consecutive W[i] values in a single XMM register
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
*/
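/*
Added illustration (not from the original source): a plain scalar C rendering of
the two-step Dean Gaudet update described above, assuming 32-bit unsigned words
and a hypothetical rol32() helper. Because rotation distributes over XOR, the
W[i] term missing from lane 3 can be folded in after the other lanes are done.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

    // compute W[i..i+3] for 16 <= i < 32, four lanes at a time
    static void w_update_16_31(uint32_t W[80], int i)
    {
        uint32_t lane0 = rol32(W[i-3] ^ W[i-8]  ^ W[i-14] ^ W[i-16], 1);
        uint32_t lane1 = rol32(W[i-2] ^ W[i-7]  ^ W[i-13] ^ W[i-15], 1);
        uint32_t lane2 = rol32(W[i-1] ^ W[i-6]  ^ W[i-12] ^ W[i-14], 1);
        uint32_t lane3 = rol32(          W[i-5] ^ W[i-11] ^ W[i-13], 1); // W[i] not known yet
        lane3 ^= rol32(lane0, 1);      // patch in the freshly computed W[i], rotated once more
        W[i] = lane0; W[i+1] = lane1; W[i+2] = lane2; W[i+3] = lane3;
    }
*/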
/* the code can be compiled into single-block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define Multiple_Blocks 1
#if BUILDKERNEL
#define stack_size (32*10+16*4+16) // ymm0-9 + 4 128-bits for intermediate WK(t) storage + 32-byte alignment
#else
#define stack_size (16*4) // 4 128-bits for intermediate WK(t) storage
#endif
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with avx2 support
#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP REV32(%rip)
#define xmov vmovaps // aligned 16-byte move
#define xmovu vmovups // unaligned 16-byte move
// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#define D %r15d
#define E %edx
// temp variables
#define T1 %eax
#define T2 %ebx
#define WK(t) ((t)&15)*4(sp)
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
// for i=60:79, F4 is identical to F2
#define F4 F2
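/*
Added note (not in the original): the F1 form used above, D ^ (B & (C ^ D)), is
the usual one-temporary rewrite of the textbook choose function (B & C) | (~B & D).
A small C self-check of the identity, using all-zero/all-one masks so every
per-bit combination is covered:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t ch_ref(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (~b & d); }
    static uint32_t f1_opt(uint32_t b, uint32_t c, uint32_t d) { return d ^ (b & (c ^ d)); }

    static void f1_selftest(void)
    {
        const uint32_t m[2] = { 0x00000000u, 0xffffffffu };
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 2; k++)
                    assert(ch_ref(m[i], m[j], m[k]) == f1_opt(m[i], m[j], m[k]));
    }
*/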
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
for (i=0;i<16;i+=4) {
1. W_TMP = new 16 bytes from MESSAGE[]
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
3. WTMP += {K,K,K,K};
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}
each step is represented in one of the following 4 macro definitions
*/
.macro W_PRECALC_00_15_0 arg0 // input argument $0 : 0/4/8/12
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
.endm
.macro W_PRECALC_00_15_1 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
vpshufb XMM_SHUFB_BSWAP, W_TMP, \arg0 // convert W_TMP from little-endian into big-endian
.endm
.macro W_PRECALC_00_15_2 arg0 // K_BASE points to the current K quadruple.
vpaddd (K_BASE), \arg0, W_TMP // W_TMP += {K,K,K,K};
.endm
.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
.endm
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1; if we are taking from the initial W before rol 1, we should rol this by 2
The operation (updating W and W+K) is scheduled as and divided into 4 steps
0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP+K;
*/
.macro W_PRECALC_16_31_0 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
vpalignr $8, \arg0, \arg1, \arg4 // W = W14
vpsrldq $4, \arg3, W_TMP // W_TMP = W3
vpxor \arg2, \arg4, \arg4 // W = W8 ^ W14
.endm
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, W_TMP, W_TMP // W_TMP = W3 ^ W16
vpxor W_TMP, \arg1, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
vpslldq $12, \arg1, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm
.macro W_PRECALC_16_31_2 arg0 // input argument : W
vpslld $1, \arg0, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
vpsrld $31, \arg0, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
vpor \arg0, W_TMP, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
vpslld $2, W_TMP2, \arg0 // W = W[i] higher 30 bits after rol 2
vpsrld $30, W_TMP2, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
.endm
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
vpxor W_TMP, \arg0, \arg0
vpxor W_TMP2, \arg0, \arg0 // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
vpaddd \arg2(K_BASE), \arg0, W_TMP // W+K
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
.endm
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_Tmp; WK = W_tmp + K;
*/
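/*
Added note (not in the original): how the rol 2 form above follows from the basic
recurrence. For i >= 32, expand each term of
    W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
one more step:
    W[i-3]  = (W[i-6]  ^ W[i-11] ^ W[i-17] ^ W[i-19]) rol 1
    W[i-8]  = (W[i-11] ^ W[i-16] ^ W[i-22] ^ W[i-24]) rol 1
    W[i-14] = (W[i-17] ^ W[i-22] ^ W[i-28] ^ W[i-30]) rol 1
    W[i-16] = (W[i-19] ^ W[i-24] ^ W[i-30] ^ W[i-32]) rol 1
XOR-ing the four right-hand sides cancels every index that appears twice
(i-11, i-17, i-19, i-22, i-24, i-30), and with (X rol 1) rol 1 = X rol 2 this leaves
    W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2
which is exactly the W6 ^ W16 ^ W28 ^ W32 form computed by the macros below.
*/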
.macro W_PRECALC_32_79_0 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
vpxor \arg0, \arg3, \arg3 // W = W28 ^ W32;
vpalignr $8, \arg1, \arg2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, \arg1, \arg1 // W_tmp = W6 ^ W16
vpxor W_TMP, \arg1, \arg1 // W_tmp = W6 ^ W16 ^ W28 ^ W32
//xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm
.macro W_PRECALC_32_79_2 arg0 // input argument : W
vpslld $2, \arg0, W_TMP // W << 2
vpsrld $30, \arg0, \arg0 // W >> 30
vpor W_TMP, \arg0, \arg0 // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
vpaddd \arg2(K_BASE), \arg0, W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
.endm
/* The hash update operation is completed by the following statements.
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
A1 = FN + E0 + rol(A0,5) + WK;
B1 = A0;
C1 = rol(B0, 30);
D1 = C0;
E1 = D0;
to avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.
Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C
These operations are distributed into the following 2 macro definitions RR0 and RR1.
*/
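/*
Added illustration (not part of the original): the pair of rounds performed by one
RR0/RR1 invocation, written as scalar C. f() stands for whichever of F1..F4 is in
use and rol32() for a 32-bit left rotate; both are assumed helpers.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
    extern uint32_t f(uint32_t b, uint32_t c, uint32_t d);   // one of F1..F4

    static void two_rounds(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                           uint32_t *e, uint32_t wk_i, uint32_t wk_i1)
    {
        // round i: the new "A" is accumulated in the E slot, B is rotated in place
        *e += f(*b, *c, *d) + rol32(*a, 5) + wk_i;
        *b  = rol32(*b, 30);
        // round i+1: the same formula, reading the slots in the order E,A,B,C,D
        *d += f(*a, *b, *c) + rol32(*e, 5) + wk_i1;
        *a  = rol32(*a, 30);
        // the live order is now D,E,A,B,C, as noted in the comment above
    }
*/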
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
rol $30, \arg2 // B = rol(B,30)
add WK(\arg6), \arg5 // E + WK(i)
rorx $27, \arg1, T2 // rol(A,5)
add WK(\arg6+1), \arg4 // D + WK(i+1)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rorx $27, \arg5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm
.macro INITIAL_W_PRECALC // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
.endm
.macro INTERNAL // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18
// starting using F2
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
W_PRECALC_32_79_2 W28
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
W_PRECALC_32_79_2 W24
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38
// starting using F3
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
W_PRECALC_32_79_2 W28
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
W_PRECALC_32_79_2 W24
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58
// starting using F4
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62
.endm
.macro SOFTWARE_PIPELINING
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0]
RR1 F4,A,B,C,D,E,70
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
.macro ENDING // finish up updating hash digests (i=64:79)
//i=80
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66
//i=84
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70
//i=88
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74
//i=92
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
.endm
.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
.macro UPDATE_ALL_HASH
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
.endm
/*
main sha1 code for system with avx2 support
*/
.macro SHA1_PIPELINED_MAIN_BODY
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subq _IMM(1), cnt // pre-decrement cnt by 1
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm
/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembler code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_AVX2
_AccelerateCrypto_SHA1_compress_AVX2:
// start the sha1 code with avx2 support
// save callee-save registers
push %rbp
mov %rsp, %rbp
push %rbx
push %r15
sub $stack_size, sp // allocate stack memory for use
// save used xmm register if this is for kernel
#if BUILDKERNEL
andq $-32, sp // aligned sp to 32-bytes
leaq 4*16(sp), %rax
xmov %ymm0, 0*32(%rax)
xmov %ymm1, 1*32(%rax)
xmov %ymm2, 2*32(%rax)
xmov %ymm3, 3*32(%rax)
xmov %ymm4, 4*32(%rax)
xmov %ymm5, 5*32(%rax)
xmov %ymm6, 6*32(%rax)
xmov %ymm7, 7*32(%rax)
xmov %ymm8, 8*32(%rax)
xmov %ymm9, 9*32(%rax)
#endif
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE
SHA1_PIPELINED_MAIN_BODY
// restore used xmm registers if this is for kernel
#if BUILDKERNEL
leaq 4*16(sp), %rax
xmov 0*32(%rax), %ymm0
xmov 1*32(%rax), %ymm1
xmov 2*32(%rax), %ymm2
xmov 3*32(%rax), %ymm3
xmov 4*32(%rax), %ymm4
xmov 5*32(%rax), %ymm5
xmov 6*32(%rax), %ymm6
xmov 7*32(%rax), %ymm7
xmov 8*32(%rax), %ymm8
xmov 9*32(%rax), %ymm9
#endif
leaq -16(%rbp), %rsp
// restore callee-save registers
pop %r15
pop %rbx
pop %rbp
ret // return
CC_ASM_SECTION_CONST
.p2align 4, 0x90
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
REV32:
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // defined(__x86_64__)

View File

@ -0,0 +1,983 @@
# Copyright (c) (2010,2011,2012,2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
#if (defined(__x86_64__) || defined(__i386__))
/* vng_sha1LittleEndian.s : this file provides optimized x86_64 and i386 implementations of the sha1 function
CoreOS - vector and numerics group
The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0];
B[0] = HASH[1];
C[0] = HASH[2];
D[0] = HASH[3];
E[0] = HASH[4];
for ( i=0; i<80; ++i )
{
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80];
HASH[1] += B[80];
HASH[2] += C[80];
HASH[3] += D[80];
HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. done on 4 consecutive W[i] values in a single XMM register
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
*/
/* the code can be compiled into single-block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define Multiple_Blocks 1
#if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures
#if defined(__x86_64__)
// set up for x86_64
#define stack_size (16*11+16*4) // xmm0-xmm10 + 4 128-bits for intermediate WK(t) storage
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks
#else // !__x86_64__
// set up for i386
#define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t))
#define sp %esp // unifying architectural stack pointer representation
#define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
#define cnt stack_size+16+8(sp) // use 2nd input argument from caller function
#define BUFFER_PTR stack_size+16+12(sp) // use 3rd input argument from caller function
#define K_BASE stack_size-4(sp) // use for K_BASE
#endif // __x86_64__
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#if defined(__x86_64__)
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported
#else // defined (__i386__)
#define W24 12*16(sp)
#define W28 13*16(sp)
#define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported
#endif
#define xmov movaps // aligned 16-byte move
#define xmovu movups // unaligned 16-byte move
// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#if defined(__x86_64__)
#define D %r15d
#else
#define D %ebp
#endif
#define E %edx
// temp variables
#define T1 %eax
#define T2 %ebx
#define WK(t) ((t)&15)*4(sp)
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
// for i=60:79, F4 is identical to F2
#define F4 F2
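/*
Added note (not in the original): the F3 comment above writes the function as
(B & C) | (D & (B ^ C)) while the instruction sequence computes
(B & C) | (D & (B | C)); both are equivalent to the textbook majority
(B & C) | (B & D) | (C & D). A small C self-check over all-zero/all-one masks:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t maj_ref(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (b & d) | (c & d); }
    static uint32_t f3_xor(uint32_t b, uint32_t c, uint32_t d)  { return (b & c) | (d & (b ^ c)); }
    static uint32_t f3_or(uint32_t b, uint32_t c, uint32_t d)   { return (b & c) | (d & (b | c)); }

    static void f3_selftest(void)
    {
        const uint32_t m[2] = { 0x00000000u, 0xffffffffu };
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 2; k++) {
                    assert(maj_ref(m[i], m[j], m[k]) == f3_xor(m[i], m[j], m[k]));
                    assert(maj_ref(m[i], m[j], m[k]) == f3_or(m[i], m[j], m[k]));
                }
    }
*/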
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
with ssse3 support, this is achieved via
for (i=0;i<16;i+=4) {
1. W_TMP = new 16 bytes from MESSAGE[]
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
3. WTMP += {K,K,K,K};
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}
each step is represented in one of the following 4 macro definitions
*/
.macro W_PRECALC_00_15_0_ssse3 arg0 // input argument $0 : 0/4/8/12
#if defined (__x86_64__) // BUFFER_PTR is already an address register in x86_64
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
#else // BUFFER_PTR is from the argument set up in the caller
mov BUFFER_PTR, T1 // T1 = BUFFER_PTR
xmovu \arg0*4(T1), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
#endif
.endm
.macro W_PRECALC_00_15_1_ssse3 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
pshufb XMM_SHUFB_BSWAP, W_TMP // convert W_TMP from little-endian into big-endian
xmov W_TMP, \arg0 // save W_TMP in the circular buffer
.endm
.macro W_PRECALC_00_15_2 // K_BASE points to the current K quadruple.
#if defined (__x86_64__) // K_BASE is already an address register in x86_64
paddd (K_BASE), W_TMP // W_TMP += {K,K,K,K};
#else // K_BASE is previously set up in the stack memory
mov K_BASE, T1 // T1 = K_BASE
paddd (T1), W_TMP // W_TMP += {K,K,K,K};
#endif
.endm
.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
.endm
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1; if we are taking from the initial W before rol 1, we should rol this by 2
The operation (updating W and W+K) is scheduled as and divided into 4 steps
0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP+K;
*/
.macro W_PRECALC_16_31_0_ssse3 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
xmov \arg1, \arg4 // W = W12
palignr $8, \arg0, \arg4 // W = W14
xmov \arg3, W_TMP // W_TMP = W4
psrldq $4, W_TMP // W_TMP = W3
pxor \arg2, \arg4 // W = W8 ^ W14
.endm
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
pxor \arg0, W_TMP // W_TMP = W3 ^ W16
pxor W_TMP, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
xmov \arg1, W_TMP2 // W_TMP2 = W3 ^ W16 ^ W8 ^ W14
xmov \arg1, W_TMP // W_TMP = W3 ^ W16 ^ W8 ^ W14
pslldq $12, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm
.macro W_PRECALC_16_31_2 arg0 // input argument : W
psrld $31, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
pslld $1, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
por \arg0, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
xmov W_TMP2, \arg0 // copy W[i] at location of W[i+3]
psrld $30, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
pslld $2, \arg0 // W = W[i] higher 30 bits after rol 2
.endm
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
#if defined (__i386__)
mov K_BASE, T1 // K_BASE is stored in the stack memory for i386
#endif
pxor \arg0, W_TMP
pxor W_TMP2, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
xmov W_TMP, \arg0 // save W = W_TMP in the W circular buffer
#if defined (__x86_64__)
paddd \arg2(K_BASE), W_TMP // W+K
#else
paddd \arg2(T1), W_TMP // W+K
#endif
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
.endm
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_Tmp; WK = W_tmp + K;
*/
.macro W_PRECALC_32_79_0_ssse3 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
xmov \arg2, W_TMP // (w1 w2 w3 w4)
pxor \arg0, \arg3 // W = W28 ^ W32;
palignr $8, \arg1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
// this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
.macro W_PRECALC_32_79_0_i386_ssse3 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
xmov \arg3, W_TMP // W32
pxor \arg0, W_TMP // W28 ^ W32
xmov W_TMP, \arg3 // W = W28 ^ W32;
xmov \arg2, W_TMP // W4
palignr $8, \arg1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
pxor \arg0, W_TMP // W_tmp = W6 ^ W16
pxor \arg1, W_TMP // W_tmp = W6 ^ W16 ^ W28 ^ W32
xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm
.macro W_PRECALC_32_79_2 arg0 // input argument : W
psrld $30, \arg0 // W >> 30
pslld $2, W_TMP // W << 2
por \arg0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
// this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
// this should be used when the input is either W24 or W28 on i386 architecture
.macro W_PRECALC_32_79_2_i386 arg0 // input argument : W
xmov \arg0, W_TMP2 // W
psrld $30, W_TMP2 // W >> 30
xmov W_TMP2, \arg0 // save (W >> 30) at W
pslld $2, W_TMP // W_tmp << 2
por \arg0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
#if defined (__x86_64__)
xmov W_TMP, \arg0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
paddd \arg2(K_BASE), W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
#else
mov K_BASE, T1 // T1 = K_BASE (which is in the caller argument)
xmov W_TMP, \arg0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
paddd \arg2(T1), W_TMP // W_tmp = W + K
xmov W_TMP, WK(\arg1&~3) // write WK
#endif
.endm
/* The hash update operation is completed by the following statements.
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
A1 = FN + E0 + rol(A0,5) + WK;
B1 = A0;
C1 = rol(B0, 30);
D1 = C0;
E1 = D0;
to avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.
Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C
These operations are distributed into the following 2 macro definitions RR0 and RR1.
*/
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
add WK(\arg6), \arg5 // E + WK(i)
rol $30, \arg2 // B = rol(B,30)
mov \arg1, T2 // T2 = A
add WK(\arg6+1), \arg4 // D + WK(i+1)
rol $5, T2 // rol(A,5)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add \arg5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
mov T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rol $5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm
/*
The following macro definitions are used to expand code for the per-block sha1 operation.
INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
ENDING : finishing up updating the digests A/B/C/D/E (i=64:79)
For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
into 1 macro definition for software pipelining.
SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up updating the digests A/B/C/D/E (i=64:79)
assume cnt (the number of blocks) >= 1, the main code body should look like
INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15)
do {
INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
cnt--;
if (cnt==0) break;
BUFFER_PTR += 64;
SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
}
ENDING // update hash digests A/B/C/D/E (i=64:79)
*/
#define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3
#define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3
#define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3
#define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3
#define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3
.macro INITIAL_W_PRECALC_ssse3 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
.endm
.macro INTERNAL_ssse3 // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18
// starting using F2
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
W_PRECALC_32_79_0 W24,W4,W0,W28
#else
W_PRECALC_32_79_0_i386 W24,W4,W0,W28
#endif
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
#if defined (__x86_64__)
W_PRECALC_32_79_2 W28
#else
W_PRECALC_32_79_2_i386 W28
#endif
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
#if defined (__x86_64__)
W_PRECALC_32_79_0 W20,W0,W28,W24
#else
W_PRECALC_32_79_0_i386 W20,W0,W28,W24
#endif
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
#if defined (__x86_64__)
W_PRECALC_32_79_2 W24
#else
W_PRECALC_32_79_2_i386 W24
#endif
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38
// starting using F3
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
W_PRECALC_32_79_0 W24,W4,W0,W28
#else
W_PRECALC_32_79_0_i386 W24,W4,W0,W28
#endif
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
#if defined (__x86_64__)
W_PRECALC_32_79_2 W28
#else
W_PRECALC_32_79_2_i386 W28
#endif
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
W_PRECALC_32_79_0 W20,W0,W28,W24
#else
W_PRECALC_32_79_0_i386 W20,W0,W28,W24
#endif
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
#if defined (__x86_64__)
W_PRECALC_32_79_2 W24
#else
W_PRECALC_32_79_2_i386 W24
#endif
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58
// starting using F4
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62
.endm
.macro SOFTWARE_PIPELINING_ssse3
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0]
RR1 F4,A,B,C,D,E,70
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
#undef W_PRECALC_32_79_0_i386
.macro ENDING // finish up updating hash digests (i=64:79)
//i=80
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66
//i=84
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70
//i=88
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74
//i=92
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
#if defined (__x86_64__)
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
#else
mov HASH_PTR, T1
mov (T1), A
mov 4(T1), B
mov 8(T1), C
mov 12(T1), D
mov 16(T1), E
#endif
.endm
.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
.macro UPDATE_ALL_HASH
#if defined (__x86_64__)
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
#else
mov HASH_PTR, T1
UPDATE_HASH (T1), A
UPDATE_HASH 4(T1), B
UPDATE_HASH 8(T1), C
UPDATE_HASH 12(T1), D
UPDATE_HASH 16(T1), E
#endif
.endm
/*
main sha1 code for system with ssse3 support
*/
.macro SHA1_PIPELINED_MAIN_BODY_ssse3
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC_ssse3 // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL_ssse3 // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined (__x86_64__)
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subq _IMM(1), cnt // pre-decrement cnt by 1
#else
addl _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subl _IMM(1), cnt // pre-decrement cnt by 1
#endif
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING_ssse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm
/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembler code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_ssse3
_AccelerateCrypto_SHA1_compress_ssse3:
// start the sha1 code with ssse3 support
// save callee-save registers
#if defined (__x86_64__)
push %rbp
mov %rsp, %rbp
push %rbx
push %r15
#else
push %ebx
push %ebp
push %esi
push %edi
#endif
sub $stack_size, sp // allocate stack memory for use
// save used xmm register if this is for kernel
#if BUILDKERNEL
xmov %xmm0, 4*16(sp)
xmov %xmm1, 5*16(sp)
xmov %xmm2, 6*16(sp)
xmov %xmm3, 7*16(sp)
xmov %xmm4, 8*16(sp)
xmov %xmm5, 9*16(sp)
xmov %xmm6, 10*16(sp)
xmov %xmm7, 11*16(sp)
#if defined (__x86_64__)
xmov %xmm8, 12*16(sp)
xmov %xmm9, 13*16(sp)
xmov %xmm10, 14*16(sp)
#endif
#endif
#if defined (__x86_64__)
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE
xmov 0x40(K_BASE), XMM_SHUFB_BSWAP
#else // __i386__
#if BUILDKERNEL
lea K_XMM_AR, %eax
#else
// Get address of 0 in R.
call 0f // Push program counter onto stack.
0: pop %eax // Get program counter.
lea K_XMM_AR-0b(%eax), %eax
#endif
mov %eax, K_BASE
xmov 0x40(%eax), %xmm0
xmov %xmm0, XMM_SHUFB_BSWAP
#endif
SHA1_PIPELINED_MAIN_BODY_ssse3
// restore used xmm registers if this is for kernel
#if BUILDKERNEL
xmov 4*16(sp), %xmm0
xmov 5*16(sp), %xmm1
xmov 6*16(sp), %xmm2
xmov 7*16(sp), %xmm3
xmov 8*16(sp), %xmm4
xmov 9*16(sp), %xmm5
xmov 10*16(sp), %xmm6
xmov 11*16(sp), %xmm7
#if defined (__x86_64__)
xmov 12*16(sp), %xmm8
xmov 13*16(sp), %xmm9
xmov 14*16(sp), %xmm10
#endif
#endif
add $stack_size, sp // deallocate stack memory
// restore callee-save registers
#if defined (__x86_64__)
pop %r15
pop %rbx
pop %rbp
#else
pop %edi
pop %esi
pop %ebp
pop %ebx
#endif
ret // return
CC_ASM_SECTION_CONST
.p2align 4, 0x90
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // architecture x86_64 or i386
#endif // (defined(__x86_64__) || defined(__i386__))

View File

@ -0,0 +1,854 @@
# Copyright (c) (2011,2012,2013,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides armv7+neon hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or memory
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
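/*
Added illustration (not part of the original): one round of the per-block loop
above as scalar C, with the a..h permutation absorbed by rotating the argument
order in the caller, which is what the "rounds"/"rounds_a" macros below do by
renaming registers. The helpers mirror the #define comments further down.

    #include <stdint.h>

    static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
    static uint32_t ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
    static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
    static uint32_t Sig0(uint32_t x) { return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
    static uint32_t Sig1(uint32_t x) { return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25); }

    // one round; wk is the precomputed K[r] + W[r] read from the stack circular buffer
    static void round_c(uint32_t a, uint32_t b, uint32_t c, uint32_t *d,
                        uint32_t e, uint32_t f, uint32_t g, uint32_t *h, uint32_t wk)
    {
        uint32_t T1 = *h + Sig1(e) + ch(e, f, g) + wk;
        *d += T1;
        *h  = T1 + Sig0(a) + maj(a, b, c);
    }

    // four consecutive rounds r..r+3, rotating the roles each time:
    //   round_c(a,b,c,&d, e,f,g,&h, WK[r+0]);
    //   round_c(h,a,b,&c, d,e,f,&g, WK[r+1]);
    //   round_c(g,h,a,&b, c,d,e,&f, WK[r+2]);
    //   round_c(f,g,h,&a, b,c,d,&e, WK[r+3]);
*/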
#if (defined(__arm__) && defined(__ARM_NEON__))
// associate variables with registers or memory
#define ctx r0
#define data r1
#define num_blocks [sp, #64]
#define _i_loop [sp, #68]
#define a r2
#define b r3
#define c r4
#define d r5
#define e r8
#define f r9
#define g r10
#define h r11
#define K r6
// 2 local variables
#define t r12
#define s lr
// a window (16 words) of message schedule
#define W0 q0
#define W1 q1
#define W2 q2
#define W3 q3
#define zero q8
// circular buffer for WK[(r:r+15)%16]
#define WK(r) [sp,#((r)&15)*4]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mvn t, $0 // ~x
and s, $0, $1 // (x) & (y)
and t, t, $2 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor t, t, s // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma0
vshr.u32 q6, $0, #7
vshl.i32 q7, $0, #14
vshr.u32 $0, $0, #3
veor $0, q6
veor $0, q7
vshr.u32 q6, #11
vshl.i32 q7, #11
veor $0, q6
veor $0, q7
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma1
vshr.u32 q6, $0, #17
vshl.i32 q7, $0, #13
vshr.u32 $0, $0, #10
veor $0, q6
veor $0, q7
vshr.u32 q6, #2
vshl.i32 q7, #2
veor $0, q6
veor $0, q7
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
ror t, $0, #2 // S32(2, (x))
ror s, $0, #13 // S32(13, (x))
eor t, t, s // S32(2, (x)) ^ S32(13, (x))
ror s, s, #9 // S32(22, (x))
eor t, t, s // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
ror t, $0, #6 // S32(6, (x))
ror s, $0, #11 // S32(11, (x))
eor t, t, s // S32(6, (x)) ^ S32(11, (x))
ror s, s, #14 // S32(25, (x))
eor t, t, s // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
// ror t, $4, #6 // S32(6, (x))
eor t, t, $4, ror #11 // S32(6, (x)) ^ S32(11, (x))
and s, $4, $5 // (x) & (y)
eor t, t, $4, ror #25 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
add $7, t // use h to store h+Sigma1(e)
bic t, $6, $4 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK($8) //
add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
ror t, $0, #2 // S32(2, (x))
add $7, s // h = T1
eor t, t, $0, ror #13 // S32(2, (x)) ^ S32(13, (x))
add $3, $7 // d += T1;
eor t, t, $0, ror #22 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add $7, t // h = T1 + Sigma0(a);
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
// add $7, s // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
ror t, $4, #6
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
ror t, $3, #6
add $7, s
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
ror t, $2, #6
add $6, s
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
ror t, $1, #6
add $5, s
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
add $4, s
.endm
.macro rounds_a
ror t, e, #6
round a, b, c, d, e, f, g, h, 0+$0
ror t, d, #6
add h, s
round h, a, b, c, d, e, f, g, 1+$0
ror t, c, #6
add g, s
round g, h, a, b, c, d, e, f, 2+$0
ror t, b, #6
add f, s
round f, g, h, a, b, c, d, e, 3+$0
add e, s
.endm
.macro rounds_a_update_W_WK
ror t, e, #6
round a, b, c, d, e, f, g, h, 0+$0
vld1.s32 {$2},[data]!
ror t, d, #6
add h, s
round h, a, b, c, d, e, f, g, 1+$0
vrev32.8 $2, $2
ror t, c, #6
vld1.s32 {q4},[K,:128]!
add g, s
round g, h, a, b, c, d, e, f, 2+$0
ror t, b, #6
add f, s
vadd.s32 q4, $2
round f, g, h, a, b, c, d, e, 3+$0
add t, sp, #($1*16)
add e, s
vst1.32 {q4},[t]
.endm
.macro rounds_e
ror t, a, #6
round e, f, g, h, a, b, c, d, 0+$0
ror t, h, #6
add d, s
round d, e, f, g, h, a, b, c, 1+$0
ror t, g, #6
add c, s
round c, d, e, f, g, h, a, b, 2+$0
ror t, f, #6
add b, s
round b, c, d, e, f, g, h, a, 3+$0
add a, s
.endm
.macro rounds_e_update_W_WK
ror t, a, #6
round e, f, g, h, a, b, c, d, 0+$0
vld1.s32 {$2},[data]!
ror t, h, #6
add d, s
round d, e, f, g, h, a, b, c, 1+$0
vrev32.8 $2, $2
ror t, g, #6
vld1.s32 {q4},[K,:128]!
add c, s
round c, d, e, f, g, h, a, b, 2+$0
ror t, f, #6
add b, s
vadd.s32 q4, $2
round b, c, d, e, f, g, h, a, 3+$0
add t, sp, #($1*16)
add a, s
vst1.32 {q4},[t]
.endm
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
.macro message_schedule
vld1.32 {q5},[K,:128]!
vext.32 q4, $0, $1, #1 // Q4 = w4:w1
sigma0 q4 // sigma0(w4:w1)
vadd.s32 $0, q4 // w3:w0 + sigma0(w4:w1)
vext.32 q6, $2, $3, #1 // Q6 = w12:w9
vadd.s32 $0, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
vext.64 q4, $3, zero, #1 // 0 0 w15:w14
sigma1 q4 // Q4 = sigma1(0 0 w15:w14)
vadd.s32 $0, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
vext.64 q4, zero, $0, #1 // Q4 = (w17:w16 0 0)
sigma1 q4 // sigma1(w17:w16 0 0)
vadd.s32 $0, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
add t, sp, #(($4&15)*4)
vadd.s32 q5, $0 // W+K
vst1.32 {q5},[t,:128]
.endm
// this macro is used in the last 16 rounds of a current block
// it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3]
// and save into stack to prepare for next block
.macro update_W_WK
vld1.s32 {$1},[data]!
vrev32.8 $1, $1
add t, sp, #($0*16)
vld1.s32 {q4},[K,:128]!
vadd.s32 q4, $1
vst1.32 {q4},[t]
.endm
.macro Update_Digits
ldrd t, s, [ctx]
add a, t
add b, s
strd a, b, [ctx]
ldrd t, s, [ctx,#8]
add c, t
add d, s
strd c, d, [ctx, #8]
ldrd t, s, [ctx,#16]
add e, t
add f, s
strd e, f, [ctx, #16]
ldrd t, s, [ctx,#24]
add g, t
add h, s
strd g, h, [ctx, #24]
.endm
.macro rounds_a_schedule_update
eor t, e, e, ror #5 // S32(6, (x)) ^ S32(11, (x))
vld1.32 {q5},[K,:128]!
eor t, t, e, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
vext.32 q4, $1, $2, #1 // Q4 = w4:w1
and s, e, f // (x) & (y)
add h, t, ror #6 // use h to store h+Sigma1(e)
bic t, g, e // (~(x)) & (z)
vshr.u32 q6, q4, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
vshl.i32 q7, q4, #14
ldr s, WK($0) //
add h, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, a, a, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshr.u32 q4, q4, #3
add h, s // h = T1
eor t, t, a, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add d, h // d += T1;
add h, t, ror #2 // h = T1 + Sigma0(a);
veor q4, q6
eor t, b, c // y^z
vshr.u32 q6, #11
and s, b, c // y&z
and t, t, a // x&(y^z)
veor q4, q7
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, d, d, ror #5 // S32(6, (x)) ^ S32(11, (x))
vshl.i32 q7, #11
add h, s
veor q4, q6
eor t, t, d, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, d, e // (x) & (y)
veor q4, q7
add g, t, ror #6 // use h to store h+Sigma1(e)
bic t, f, d // (~(x)) & (z)
vext.32 q6, $3, $4, #1 // Q6 = w12:w9
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(1+$0) //
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1)
add g, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, h, h, ror #11 // S32(2, (x)) ^ S32(13, (x))
vadd.s32 $1, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
add g, s // h = T1
eor t, t, h, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
vext.64 q4, $4, zero, #1 // 0 0 w15:w14
add c, g // d += T1;
add g, t, ror #2 // h = T1 + Sigma0(a);
eor t, a, b // y^z
and s, a, b // y&z
vshr.u32 q6, q4, #17
and t, t, h // x&(y^z)
vshl.i32 q7, q4, #13
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
vshr.u32 q4, q4, #10
eor t, c, c, ror #5 // S32(6, (x)) ^ S32(11, (x))
veor q4, q6
add g, s
veor q4, q7
eor t, t, c, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
vshr.u32 q6, #2
and s, c, d // (x) & (y)
vshl.i32 q7, #2
add f, t, ror #6 // use h to store h+Sigma1(e)
veor q4, q6
bic t, e, c // (~(x)) & (z)
veor q4, q7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
ldr s, WK(2+$0) //
vext.64 q4, zero, $1, #1 // Q4 = (w17:w16 0 0)
add f, t // t = h+Sigma1(e)+Ch(e,f,g);
vshr.u32 q6, q4, #17
eor t, g, g, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshl.i32 q7, q4, #13
add f, s // h = T1
vshr.u32 q4, q4, #10
eor t, t, g, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
veor q4, q6
add b, f // d += T1;
veor q4, q7
add f, t, ror #2 // h = T1 + Sigma0(a);
eor t, h, a // y^z
vshr.u32 q6, #2
and s, h, a // y&z
and t, t, g // x&(y^z)
vshl.i32 q7, #2
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, b, b, ror #5 // S32(6, (x)) ^ S32(11, (x))
veor q4, q6
add f, s
eor t, t, b, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
veor q4, q7
vadd.s32 $1, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
and s, b, c // (x) & (y)
add e, t, ror #6 // use h to store h+Sigma1(e)
bic t, d, b // (~(x)) & (z)
vadd.s32 q5, $1 // W+K
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(3+$0) //
add e, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, f, f, ror #11 // S32(2, (x)) ^ S32(13, (x))
add e, s // h = T1
eor t, t, f, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add a, e // d += T1;
add e, t, ror #2 // h = T1 + Sigma0(a);
eor t, g, h // y^z
and s, g, h // y&z
and t, t, f // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add t, sp, #(($0&15)*4)
add e, s
vst1.32 {q5},[t,:128]
.endm
.macro rounds_e_schedule_update
eor t, a, a, ror #5 // S32(6, (x)) ^ S32(11, (x))
vld1.32 {q5},[K,:128]!
eor t, t, a, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
vext.32 q4, $1, $2, #1 // Q4 = w4:w1
and s, a, b // (x) & (y)
add d, t, ror #6 // use h to store h+Sigma1(e)
bic t, c, a // (~(x)) & (z)
vshr.u32 q6, q4, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
vshl.i32 q7, q4, #14
ldr s, WK($0) //
add d, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, e, e, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshr.u32 q4, q4, #3
add d, s // h = T1
eor t, t, e, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add h, d // d += T1;
veor q4, q6
add d, t, ror #2 // h = T1 + Sigma0(a);
vshr.u32 q6, #11
eor t, f, g // y^z
and s, f, g // y&z
veor q4, q7
and t, t, e // x&(y^z)
vshl.i32 q7, #11
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
veor q4, q6
eor t, h, h, ror #5 // S32(6, (x)) ^ S32(11, (x))
vext.32 q6, $3, $4, #1 // Q6 = w12:w9
add d, s
veor q4, q7
eor t, t, h, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, h, a // (x) & (y)
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1)
add c, t, ror #6 // use h to store h+Sigma1(e)
bic t, b, h // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(1+$0) //
add c, t // t = h+Sigma1(e)+Ch(e,f,g);
vadd.s32 $1, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
eor t, d, d, ror #11 // S32(2, (x)) ^ S32(13, (x))
vext.64 q4, $4, zero, #1 // 0 0 w15:w14
add c, s // h = T1
eor t, t, d, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add g, c // d += T1;
vshr.u32 q6, q4, #17
add c, t, ror #2 // h = T1 + Sigma0(a);
vshl.i32 q7, q4, #13
eor t, e, f // y^z
vshr.u32 q4, q4, #10
and s, e, f // y&z
and t, t, d // x&(y^z)
veor q4, q6
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
veor q4, q7
eor t, g, g, ror #5 // S32(6, (x)) ^ S32(11, (x))
vshr.u32 q6, #2
add c, s
vshl.i32 q7, #2
eor t, t, g, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
veor q4, q6
and s, g, h // (x) & (y)
veor q4, q7
add b, t, ror #6 // use h to store h+Sigma1(e)
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
bic t, a, g // (~(x)) & (z)
vext.64 q4, zero, $1, #1 // Q4 = (w17:w16 0 0)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(2+$0) //
add b, t // t = h+Sigma1(e)+Ch(e,f,g);
vshr.u32 q6, q4, #17
eor t, c, c, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshl.i32 q7, q4, #13
add b, s // h = T1
vshr.u32 q4, q4, #10
eor t, t, c, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add f, b // d += T1;
veor q4, q6
add b, t, ror #2 // h = T1 + Sigma0(a);
vshr.u32 q6, #2
eor t, d, e // y^z
veor q4, q7
and s, d, e // y&z
vshl.i32 q7, #2
and t, t, c // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
veor q4, q6
eor t, f, f, ror #5 // S32(6, (x)) ^ S32(11, (x))
veor q4, q7
add b, s
eor t, t, f, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, f, g // (x) & (y)
add a, t, ror #6 // use h to store h+Sigma1(e)
bic t, h, f // (~(x)) & (z)
vadd.s32 $1, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(3+$0) //
add a, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, b, b, ror #11 // S32(2, (x)) ^ S32(13, (x))
add a, s // h = T1
eor t, t, b, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
vadd.s32 q5, $1 // W+K
add e, a // d += T1;
add a, t, ror #2 // h = T1 + Sigma0(a);
eor t, c, d // y^z
and s, c, d // y&z
and t, t, b // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add t, sp, #(($0&15)*4)
add a, s
vst1.32 {q5},[t,:128]
.endm
.subsections_via_symbols
.text
.p2align 4
K256:
.long 0x428a2f98
.long 0x71374491
.long 0xb5c0fbcf
.long 0xe9b5dba5
.long 0x3956c25b
.long 0x59f111f1
.long 0x923f82a4
.long 0xab1c5ed5
.long 0xd807aa98
.long 0x12835b01
.long 0x243185be
.long 0x550c7dc3
.long 0x72be5d74
.long 0x80deb1fe
.long 0x9bdc06a7
.long 0xc19bf174
.long 0xe49b69c1
.long 0xefbe4786
.long 0x0fc19dc6
.long 0x240ca1cc
.long 0x2de92c6f
.long 0x4a7484aa
.long 0x5cb0a9dc
.long 0x76f988da
.long 0x983e5152
.long 0xa831c66d
.long 0xb00327c8
.long 0xbf597fc7
.long 0xc6e00bf3
.long 0xd5a79147
.long 0x06ca6351
.long 0x14292967
.long 0x27b70a85
.long 0x2e1b2138
.long 0x4d2c6dfc
.long 0x53380d13
.long 0x650a7354
.long 0x766a0abb
.long 0x81c2c92e
.long 0x92722c85
.long 0xa2bfe8a1
.long 0xa81a664b
.long 0xc24b8b70
.long 0xc76c51a3
.long 0xd192e819
.long 0xd6990624
.long 0xf40e3585
.long 0x106aa070
.long 0x19a4c116
.long 0x1e376c08
.long 0x2748774c
.long 0x34b0bcb5
.long 0x391c0cb3
.long 0x4ed8aa4a
.long 0x5b9cca4f
.long 0x682e6ff3
.long 0x748f82ee
.long 0x78a5636f
.long 0x84c87814
.long 0x8cc70208
.long 0x90befffa
.long 0xa4506ceb
.long 0xbef9a3f7
.long 0xc67178f2
.syntax unified
.p2align 2
.code 16
.thumb_func _AccelerateCrypto_SHA256_compress
.globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:
// due to the change of order in the 2nd and 3rd calling arguments,
// we need to switch r1/r2 to use the original code
mov r12, r1
mov r1, r2
mov r2, r12
// push callee-saved registers
push {r4-r7,lr}
add r7, sp, #12 // set up dtrace frame pointer
push {r8-r11}
// align sp to 16-byte boundary
mov r12, sp
ands r12, r12, #15 // bytes to align to 16-byte boundary
it eq
addeq r12, #16 // if nothing, enforce to insert 16 bytes
sub sp, r12
str r12, [sp]
#if BUILDKERNEL
vpush {q8}
#endif
vpush {q0-q7}
#define stack_size (16*5) // 64-byte circular buffer for WK(0:15), plus 16 extra bytes for num_blocks and the loop counter
sub sp, #stack_size
str r2, num_blocks
veor zero, zero
// set up pointer to table K256[]
ldr K, L_table1
L_table0:
mov r12, pc
ldr K, [r12, K]
bal 0f
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+4)
0:
// load W[0:15]
vld1.s32 {W0-W1},[data]!
vld1.s32 {W2-W3},[data]!
// load K[0:15] & per word byte swap
vrev32.8 W0, W0
vrev32.8 W1, W1
vld1.s32 {q4-q5}, [K,:128]!
vrev32.8 W2, W2
vrev32.8 W3, W3
vld1.s32 {q6-q7}, [K,:128]!
// compute WK[0:15] and save in stack
vadd.s32 q4, q0
vadd.s32 q5, q1
vadd.s32 q6, q2
vadd.s32 q7, q3
vstmia sp,{q4-q7}
// digests a-h = ctx->states;
ldmia ctx,{a-d,e-h}
L_loop:
// rounds 0:47 interleaved with W/WK update for rounds 16:63
mov t, #3
str t, _i_loop
L_i_loop:
rounds_a_schedule_update 0,W0,W1,W2,W3
rounds_e_schedule_update 4,W1,W2,W3,W0
rounds_a_schedule_update 8,W2,W3,W0,W1
rounds_e_schedule_update 12,W3,W0,W1,W2
ldr t, _i_loop
subs t, t, #1
str t, _i_loop
bgt L_i_loop
// revert K to the beginning of K256[]
ldr t, num_blocks
sub K, #256
subs t, #1 // num_blocks--
beq L_final_block // if final block, wrap up final rounds
str t, num_blocks
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
#if 0
rounds_a 48
update_W_WK 0, W0
rounds_e 52
update_W_WK 1, W1
rounds_a 56
update_W_WK 2, W2
rounds_e 60
update_W_WK 3, W3
#else
rounds_a_update_W_WK 48, 0, W0
rounds_e_update_W_WK 52, 1, W1
rounds_a_update_W_WK 56, 2, W2
rounds_e_update_W_WK 60, 3, W3
#endif
// ctx->states += digests a-h
Update_Digits
// digests a-h = ctx->states;
// ldmia ctx,{a-d,e-h}
bal L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds_a 48
rounds_e 52
rounds_a 56
rounds_e 60
// ctx->states += digests a-h
Update_Digits
// free allocated stack memory
add sp, #stack_size
// restore q0-q7, plus q8 if kernel
vpop {q0-q1}
vpop {q2-q3}
vpop {q4-q5}
vpop {q6-q7}
#if BUILDKERNEL
vpop {q8}
#endif
// dealign sp from the 16-byte boundary
ldr r12, [sp]
add sp, r12
// restore callee-save registers and return
pop {r8-r11}
pop {r4-r7,pc}
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 2
L_Tab$non_lazy_ptr:
.indirect_symbol K256
.long 0
#endif // (defined(__arm__) && defined(__ARM_NEON__))

View File

@ -0,0 +1,389 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides armv7+neon hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or memory
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
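The entry point below drives the same transform with the ARMv8 Cryptographic Extension
SHA-256 instructions (SHA256H, SHA256H2, SHA256SU0, SHA256SU1, wrapped by the macros from
arm64_isa_compatibility.h). Roughly the same flow can be sketched with the arm_neon.h
intrinsics; the routine below is a hedged illustration under that assumption (it needs a
toolchain targeting the SHA-2 extension and reuses the sha256_K table defined elsewhere in
this package), not the function exported by this file.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

extern const uint32_t sha256_K[64];

static void sha256_blocks_sha2ext(uint32_t state[8], const uint8_t *data, size_t nblocks)
{
    uint32x4_t s0 = vld1q_u32(state);       // a,b,c,d
    uint32x4_t s1 = vld1q_u32(state + 4);   // e,f,g,h
    while (nblocks--) {
        uint32x4_t abcd = s0, efgh = s1;    // running hash, added back at the end of the block
        uint32x4_t m0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data +  0)));  // byte swap W0:W3
        uint32x4_t m1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
        uint32x4_t m2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
        uint32x4_t m3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));
        data += 64;
        for (int r = 0; r < 64; r += 4) {
            uint32x4_t wk = vaddq_u32(m0, vld1q_u32(sha256_K + r));   // W+K for rounds r:r+3
            if (r < 48)                                               // schedule 16 rounds ahead
                m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);
            uint32x4_t tmp = s0;
            s0 = vsha256hq_u32(s0, s1, wk);                           // update a,b,c,d
            s1 = vsha256h2q_u32(s1, tmp, wk);                         // update e,f,g,h
            uint32x4_t rot = m0; m0 = m1; m1 = m2; m2 = m3; m3 = rot; // rotate the W window
        }
        s0 = vaddq_u32(s0, abcd);                                     // ctx->states += digests
        s1 = vaddq_u32(s1, efgh);
    }
    vst1q_u32(state, s0);
    vst1q_u32(state + 4, s1);
}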
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
.subsections_via_symbols
.text
.p2align 4
K256:
.long 0x428a2f98
.long 0x71374491
.long 0xb5c0fbcf
.long 0xe9b5dba5
.long 0x3956c25b
.long 0x59f111f1
.long 0x923f82a4
.long 0xab1c5ed5
.long 0xd807aa98
.long 0x12835b01
.long 0x243185be
.long 0x550c7dc3
.long 0x72be5d74
.long 0x80deb1fe
.long 0x9bdc06a7
.long 0xc19bf174
.long 0xe49b69c1
.long 0xefbe4786
.long 0x0fc19dc6
.long 0x240ca1cc
.long 0x2de92c6f
.long 0x4a7484aa
.long 0x5cb0a9dc
.long 0x76f988da
.long 0x983e5152
.long 0xa831c66d
.long 0xb00327c8
.long 0xbf597fc7
.long 0xc6e00bf3
.long 0xd5a79147
.long 0x06ca6351
.long 0x14292967
.long 0x27b70a85
.long 0x2e1b2138
.long 0x4d2c6dfc
.long 0x53380d13
.long 0x650a7354
.long 0x766a0abb
.long 0x81c2c92e
.long 0x92722c85
.long 0xa2bfe8a1
.long 0xa81a664b
.long 0xc24b8b70
.long 0xc76c51a3
.long 0xd192e819
.long 0xd6990624
.long 0xf40e3585
.long 0x106aa070
.long 0x19a4c116
.long 0x1e376c08
.long 0x2748774c
.long 0x34b0bcb5
.long 0x391c0cb3
.long 0x4ed8aa4a
.long 0x5b9cca4f
.long 0x682e6ff3
.long 0x748f82ee
.long 0x78a5636f
.long 0x84c87814
.long 0x8cc70208
.long 0x90befffa
.long 0xa4506ceb
.long 0xbef9a3f7
.long 0xc67178f2
.p2align 4
.globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:
#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3
BRANCH_TARGET_CALL
#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
adrp ktable, K256@page
cbnz numblocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
ret lr // otherwise, return
1:
add ktable, ktable, K256@pageoff
#if BUILDKERNEL
// save q0-q7, q16-q24: 8+8+1 = 17 vector registers (17*16 bytes)
sub x4, sp, #17*16
sub sp, sp, #17*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20, v21, v22, v23}, [x4], #64
st1.4s {v24}, [x4], #16
#endif
ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
rev32.16b v0, v0 // byte swap of 1st 4 ints
ldr q21, [ktable, #16*0]
rev32.16b v1, v1 // byte swap of 2nd 4 ints
ldr q16, [hashes, #0]
rev32.16b v2, v2 // byte swap of 3rd 4 ints
ldr q17, [hashes, #16]
rev32.16b v3, v3 // byte swap of 4th 4 ints
ldr q22, [ktable, #16*1]
mov.16b v18, v16
ldr q23, [ktable, #16*2]
add.4s v4, v0, v21 // 1st 4 input + K256
ldr q24, [ktable, #16*3]
add.4s v5, v1, v22 // 2nd 4 input + K256
mov.16b v19, v17
add.4s v6, v2, v23 // 3rd 4 input + K256
add.4s v7, v3, v24 // 4th 4 input + K256
add ktable, ktable, #16*4
.macro sha256_round
mov.16b v20, v18
SHA256SU0 $0, $1
SHA256H 18, 19, $4
SHA256SU1 $0, $2, $3
SHA256H2 19, 20, $4
add.4s $6, $5, $7
.endm
// 4 vector hashes update and load next vector rounds
.macro sha256_hash_load_round
mov.16b v20, v18
SHA256H 18, 19, $0
rev32.16b $1, $1
SHA256H2 19, 20, $0
add.4s $2, $1, $3
.endm
.macro sha256_hash_round
mov.16b v20, v18
SHA256H 18, 19, $0
SHA256H2 19, 20, $0
.endm
// 12 vector hash and sequence update rounds
mov w4, #3
L_i_loop:
mov.16b v20, v18
ldr q21, [ktable, #0] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #32] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #48] // k3
SHA256H2 19, 20, 4
add ktable, ktable, #64
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
subs w4, w4, #1
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
b.gt L_i_loop
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.le L_wrapup
sub ktable, ktable, #256
L_loop:
ldr q0, [data, #0]
mov.16b v20, v18
ldr q21, [ktable,#0]
SHA256H 18, 19, 4
ldr q1, [data, #16]
rev32.16b v0, v0
ldr q2, [data, #32]
SHA256H2 19, 20, 4
ldr q3, [data, #48]
add.4s v4, v0, v21
ldr q22, [ktable,#16]
mov.16b v20, v18
add data, data, #64
SHA256H 18, 19, 5
ldr q23, [ktable,#32]
rev32.16b v1, v1
ldr q24, [ktable,#48]
SHA256H2 19, 20, 5
add.4s v5, v1, v22
sha256_hash_load_round 6, v2, v6, v23
sha256_hash_load_round 7, v3, v7, v24
add.4s v18, v16, v18
add.4s v19, v17, v19
mov.16b v16, v18
mov.16b v17, v19
// 12 vector hash and sequence update rounds
mov.16b v20, v18
ldr q21, [ktable, #16*4] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16*5] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #16*6] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #16*7] // k3
SHA256H2 19, 20, 4
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
mov.16b v20, v18
ldr q21, [ktable, #16*8] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16*9] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #16*10] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #16*11] // k3
SHA256H2 19, 20, 4
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
mov.16b v20, v18
ldr q21, [ktable, #16*12] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16*13] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #16*14] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #16*15] // k3
SHA256H2 19, 20, 4
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.gt L_loop
L_wrapup:
sha256_hash_round 4
sha256_hash_round 5
sha256_hash_round 6
sha256_hash_round 7
add.4s v16, v16, v18
add.4s v17, v17, v19
st1.4s {v16,v17}, [hashes] // hashes q16 : d,c,b,a q17 : h,g,f,e
#if BUILDKERNEL
// restore q0-q7, q16-q24
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20, v21, v22, v23}, [sp], #64
ld1.4s {v24}, [sp], #16
#endif
ret lr
#endif // arm64

View File

@ -0,0 +1,796 @@
# Copyright (c) (2011-2013,2015,2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This is for the Chinook AOP (arm64), which does not support crypto instructions.
This file provides arm64 neon hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or memory
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
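Because this target lacks the SHA-256 instructions, the macros below compute sigma0/sigma1
and the schedule extension with plain NEON shifts, eors and ext operations. The same step
can be sketched with generic arm_neon.h intrinsics; the following is a hedged illustration
(names are local to the sketch; sha256_K is the table defined elsewhere in this package):
given W[r..r+15] in four vectors it produces W[r+16..r+19] and the corresponding W+K.

#include <arm_neon.h>
#include <stdint.h>

extern const uint32_t sha256_K[64];

#define ROTR32X4(x, n)  vorrq_u32(vshrq_n_u32((x), (n)), vshlq_n_u32((x), 32 - (n)))

static inline uint32x4_t sigma0x4(uint32x4_t x)   // ROTR7 ^ ROTR18 ^ SHR3, per 32-bit lane
{
    return veorq_u32(veorq_u32(ROTR32X4(x, 7), ROTR32X4(x, 18)), vshrq_n_u32(x, 3));
}

static inline uint32x4_t sigma1x4(uint32x4_t x)   // ROTR17 ^ ROTR19 ^ SHR10, per 32-bit lane
{
    return veorq_u32(veorq_u32(ROTR32X4(x, 17), ROTR32X4(x, 19)), vshrq_n_u32(x, 10));
}

// m0 = W[r:r+3], m1 = W[r+4:r+7], m2 = W[r+8:r+11], m3 = W[r+12:r+15]
// returns W[r+16:r+19]; *wk receives W+K for those four future rounds
static inline uint32x4_t schedule4(uint32x4_t m0, uint32x4_t m1,
                                   uint32x4_t m2, uint32x4_t m3,
                                   int r, uint32x4_t *wk)
{
    const uint32x4_t zero = vdupq_n_u32(0);
    uint32x4_t w = vaddq_u32(m0, sigma0x4(vextq_u32(m0, m1, 1)));  // W[r-16] + sigma0(W[r-15])
    w = vaddq_u32(w, vextq_u32(m2, m3, 1));                        // + W[r-7]
    w = vaddq_u32(w, sigma1x4(vextq_u32(m3, zero, 2)));            // + sigma1(W[r-2]) for the low 2 lanes
    w = vaddq_u32(w, sigma1x4(vextq_u32(zero, w, 2)));             // + sigma1(W[r-2]) for the high 2 lanes
    *wk = vaddq_u32(w, vld1q_u32(sha256_K + r + 16));
    return w;
}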
// associate variables with registers or memory
#define ctx x0
#define num_blocks x1
#define data x2
#define ktable x3
#define _i_loop x4
#define a w5
#define bb w6
#define c w7
#define d w8
#define e w9
#define f w10
#define g w11
#define h w12
// 2 local variables
#define t w13
#define s w14
// a window (16 words) of message schedule
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define qW0 q0
#define qW1 q1
#define qW2 q2
#define qW3 q3
#define zero v16
#define WK0 v4
#define WK1 v5
#define WK2 v6
#define WK3 v7
#define qWK0 q4
#define qWK1 q5
#define qWK2 q6
#define qWK3 q7
// circular buffer for WK[(r:r+15)%16]
#define WK(r) [sp,#((r)&15)*4]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mvn t, $0 // ~x
and s, $0, $1 // (x) & (y)
and t, t, $2 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor t, t, s // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma0
vshr.u32 q6, $0, #7
vshl.i32 q7, $0, #14
vshr.u32 $0, $0, #3
veor $0, q6
veor $0, q7
vshr.u32 q6, #11
vshl.i32 q7, #11
veor $0, q6
veor $0, q7
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma1
vshr.u32 q6, $0, #17
vshl.i32 q7, $0, #13
vshr.u32 $0, $0, #10
veor $0, q6
veor $0, q7
vshr.u32 q6, #2
vshl.i32 q7, #2
veor $0, q6
veor $0, q7
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
ror t, $0, #2 // S32(2, (x))
ror s, $0, #13 // S32(13, (x))
eor t, t, s // S32(2, (x)) ^ S32(13, (x))
ror s, s, #9 // S32(22, (x))
eor t, t, s // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
ror t, $0, #6 // S32(6, (x))
ror s, $0, #11 // S32(11, (x))
eor t, t, s // S32(6, (x)) ^ S32(11, (x))
ror s, s, #14 // S32(25, (x))
eor t, t, s // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
// ror t, $4, #6 // S32(6, (x))
eor t, t, $4, ror #11 // S32(6, (x)) ^ S32(11, (x))
eor t, t, $4, ror #25 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, $4, $5 // (x) & (y)
add $7, $7, t // use h to store h+Sigma1(e)
bic t, $6, $4 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $8 //
add $7, $7, t // t = h+Sigma1(e)+Ch(e,f,g);
ror t, $0, #2 // S32(2, (x))
add $7, $7, s // h = T1
eor t, t, $0, ror #13 // S32(2, (x)) ^ S32(13, (x))
add $3, $3, $7 // d += T1;
eor t, t, $0, ror #22 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add $7, $7, t // h = T1 + Sigma0(a);
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
// add $7, s // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
ror t, $4, #6
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
ror t, $3, #6
add $7, s
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
ror t, $2, #6
add $6, s
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
ror t, $1, #6
add $5, s
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
add $4, s
.endm
.macro rounds_a
ror t, e, #6
round a, bb, c, d, e, f, g, h, $0.s[0]
ror t, d, #6
add h, h, s
round h, a, bb, c, d, e, f, g, $0.s[1]
ror t, c, #6
add g, g, s
round g, h, a, bb, c, d, e, f, $0.s[2]
ror t, bb, #6
add f, f, s
round f, g, h, a, bb, c, d, e, $0.s[3]
add e, e, s
.endm
.macro rounds_e
ror t, a, #6
round e, f, g, h, a, bb, c, d, $0.s[0]
ror t, h, #6
add d, d, s
round d, e, f, g, h, a, bb, c, $0.s[1]
ror t, g, #6
add c, c, s
round c, d, e, f, g, h, a, bb, $0.s[2]
ror t, f, #6
add bb, bb, s
round bb, c, d, e, f, g, h, a, $0.s[3]
add a, a, s
.endm
.macro rounds_a_update_W_WK
ror t, e, #6
ldr $3, [data], #16
round a, bb, c, d, e, f, g, h, $0.s[0]
ror t, d, #6
rev32.16b $1, $1
add h, h, s
round h, a, bb, c, d, e, f, g, $0.s[1]
ror t, c, #6
add g, g, s
ldr q17, [ktable], #16
round g, h, a, bb, c, d, e, f, $0.s[2]
ror t, bb, #6
add f, f, s
round f, g, h, a, bb, c, d, e, $0.s[3]
add e, e, s
add.4s $0, v17, $1
.endm
.macro rounds_e_update_W_WK
ror t, a, #6
ldr $3, [data], #16
round e, f, g, h, a, bb, c, d, $0.s[0]
ror t, h, #6
rev32.16b $1, $1
add d, d, s
round d, e, f, g, h, a, bb, c, $0.s[1]
ror t, g, #6
add c, c, s
ldr q17, [ktable], #16
round c, d, e, f, g, h, a, bb, $0.s[2]
ror t, f, #6
add bb, bb, s
round bb, c, d, e, f, g, h, a, $0.s[3]
add a, a, s
add.4s $0, v17, $1
.endm
// this macro is used in the last 16 rounds of a current block
// it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3]
// and save into stack to prepare for next block
.macro update_W_WK
ldr $3, [data]
ldr $2, [ktable]
add data, data, #16
rev32.16b $1, $1
add ktable, ktable, #16
add.4s $0, $0, $1
.endm
.macro Update_Digits
ldp t, s, [ctx]
add a, a, t
add bb, bb, s
stp a, bb, [ctx]
ldp t, s, [ctx,#8]
add c, c, t
add d, d, s
stp c, d, [ctx, #8]
ldp t, s, [ctx,#16]
add e, e, t
add f, f, s
stp e, f, [ctx, #16]
ldp t, s, [ctx,#24]
add g, g, t
add h, h, s
stp g, h, [ctx, #24]
.endm
.macro rounds_a_schedule_update
eor t, e, e, ror #5 // S32(6, (x)) ^ S32(11, (x))
ldr q17, [ktable], #16
eor t, t, e, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ext.16b v18, $1, $2, #4 // w4:w1
ror t, t, #6
and s, e, f // (x) & (y)
add h, h, t // use h to store h+Sigma1(e)
bic t, g, e // (~(x)) & (z)
ushr.4s v19, v18, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[0] //
add h, h, t // t = h+Sigma1(e)+Ch(e,f,g);
shl.4s v20, v18, #14
eor t, a, a, ror #11 // S32(2, (x)) ^ S32(13, (x))
ushr.4s v18, v18, #3
add h, h, s // h = T1
eor t, t, a, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add d, d, h // d += T1;
ror t, t, #2
eor.16b v18, v18, v19
add h, h, t // h = T1 + Sigma0(a);
ushr.4s v19, v19, #11
eor t, bb, c // y^z
and s, bb, c // y&z
and t, t, a // x&(y^z)
eor.16b v18, v18, v20
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
shl.4s v20, v20, #11
eor t, d, d, ror #5 // S32(6, (x)) ^ S32(11, (x))
add h, h, s
eor t, t, d, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
eor.16b v18, v18, v19
and s, d, e // (x) & (y)
ext.16b v19, $3, $4, #4 // q19 = w12:w9
ror t, t, #6
add g, g, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, f, d // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[1] //
add g, g, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, h, h, ror #11 // S32(2, (x)) ^ S32(13, (x))
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1)
add g, g, s // h = T1
ext.16b v18, $4, zero, #8 // 0 0 w15:w14
eor t, t, h, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add.4s $1, $1, v19 // w3:w0 + sigma0(w4:w1) + w12:w9
ror t, t, #2
add c, c, g // d += T1;
ushr.4s v19, v18, #17
add g, g, t // h = T1 + Sigma0(a);
shl.4s v20, v18, #13
eor t, a, bb // y^z
ushr.4s v18, v18, #10
and s, a, bb // y&z
and t, t, h // x&(y^z)
eor.16b v18, v18, v19
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
ushr.4s v19, v19, #2
eor t, c, c, ror #5 // S32(6, (x)) ^ S32(11, (x))
add g, g, s
eor.16b v18, v18, v20
eor t, t, c, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
shl.4s v20, v20, #2
ror t, t, #6
and s, c, d // (x) & (y)
eor.16b v18, v18, v19
add f, f, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, e, c // (~(x)) & (z)
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[2] //
add f, f, t // t = h+Sigma1(e)+Ch(e,f,g);
ext.16b v18, zero, $1, #8 // Q4 = (w17:w16 0 0)
eor t, g, g, ror #11 // S32(2, (x)) ^ S32(13, (x))
add f, f, s // h = T1
eor t, t, g, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
ushr.4s v19, v18, #17
add bb, bb, f // d += T1;
shl.4s v20, v18, #13
ror t, t, #2
ushr.4s v18, v18, #10
add f, f, t // h = T1 + Sigma0(a);
eor t, h, a // y^z
and s, h, a // y&z
eor.16b v18, v18, v19
and t, t, g // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, bb, bb, ror #5 // S32(6, (x)) ^ S32(11, (x))
add f, f, s
eor.16b v18, v18, v20
eor t, t, bb, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ushr.4s v19, v19, #2
ror t, t, #6
shl.4s v20, v20, #2
and s, bb, c // (x) & (y)
eor.16b v18, v18, v19
add e, e, t // use h to store h+Sigma1(e)
bic t, d, bb // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[3] //
add e, e, t // t = h+Sigma1(e)+Ch(e,f,g);
eor.16b v18, v18, v20
eor t, f, f, ror #11 // S32(2, (x)) ^ S32(13, (x))
add e, e, s // h = T1
eor t, t, f, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add a, a, e // d += T1;
ror t, t, #2
add.4s $1, $1, v18 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
add e, e, t // h = T1 + Sigma0(a);
eor t, g, h // y^z
and s, g, h // y&z
add.4s $5, v17, $1 // W+K
and t, t, f // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add e, e, s
.endm
.macro rounds_e_schedule_update
eor t, a, a, ror #5 // S32(6, (x)) ^ S32(11, (x))
ldr q17, [ktable], #16 // K
eor t, t, a, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ext.16b v18, $1, $2, #4 // Q18 = w4:w1
ror t, t, #6
and s, a, bb // (x) & (y)
add d, d, t // use h to store h+Sigma1(e)
bic t, c, a // (~(x)) & (z)
ushr.4s v19, v18, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[0]
add d, d, t // t = h+Sigma1(e)+Ch(e,f,g);
shl.4s v20, v18, #14
eor t, e, e, ror #11 // S32(2, (x)) ^ S32(13, (x))
ushr.4s v18, v18, #3
add d, d, s // h = T1
eor t, t, e, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add h, h, d // d += T1;
ror t, t, #2
eor.16b v18, v18, v19
add d, d, t // h = T1 + Sigma0(a);
ushr.4s v19, v19, #11
eor t, f, g // y^z
and s, f, g // y&z
and t, t, e // x&(y^z)
eor.16b v18, v18, v20
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
shl.4s v20, v20, #11
eor t, h, h, ror #5 // S32(6, (x)) ^ S32(11, (x))
add d, d, s
eor t, t, h, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
eor.16b v18, v18, v19
and s, h, a // (x) & (y)
ext.16b v19, $3, $4, #4 // q19 = w12:w9
ror t, t, #6
add c, c, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, bb, h // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[1]
add c, c, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, d, d, ror #11 // S32(2, (x)) ^ S32(13, (x))
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1)
add c, c, s // h = T1
ext.16b v18, $4, zero, #8 // 0 0 w15:w14
eor t, t, d, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add.4s $1, $1, v19 // w3:w0 + sigma0(w4:w1) + w12:w9
ror t, t, #2
add g, g, c // d += T1;
ushr.4s v19, v18, #17
add c, c, t // h = T1 + Sigma0(a);
shl.4s v20, v18, #13
eor t, e, f // y^z
ushr.4s v18, v18, #10
and s, e, f // y&z
and t, t, d // x&(y^z)
eor.16b v18, v18, v19
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
ushr.4s v19, v19, #2
eor t, g, g, ror #5 // S32(6, (x)) ^ S32(11, (x))
add c, c, s
eor.16b v18, v18, v20
eor t, t, g, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
shl.4s v20, v20, #2
ror t, t, #6
and s, g, h // (x) & (y)
eor.16b v18, v18, v19
add bb, bb, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, a, g // (~(x)) & (z)
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[2]
add bb, bb, t // t = h+Sigma1(e)+Ch(e,f,g);
ext.16b v18, zero, $1, #8 // Q18 = (w17:w16 0 0)
eor t, c, c, ror #11 // S32(2, (x)) ^ S32(13, (x))
add bb, bb, s // h = T1
eor t, t, c, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
ushr.4s v19, v18, #17
add f, f, bb // d += T1;
shl.4s v20, v18, #13
ror t, t, #2
ushr.4s v18, v18, #10
add bb, bb, t // h = T1 + Sigma0(a);
eor t, d, e // y^z
and s, d, e // y&z
eor.16b v18, v18, v19
and t, t, c // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, f, f, ror #5 // S32(6, (x)) ^ S32(11, (x))
add bb, bb, s
eor.16b v18, v18, v20
eor t, t, f, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ushr.4s v19, v19, #2
ror t, t, #6
shl.4s v20, v20, #2
and s, f, g // (x) & (y)
add a, a, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v19
bic t, h, f // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[3]
add a, a, t // t = h+Sigma1(e)+Ch(e,f,g);
eor.16b v18, v18, v20
eor t, bb, bb, ror #11 // S32(2, (x)) ^ S32(13, (x))
add a, a, s // h = T1
eor t, t, bb, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
ror t, t, #2
add.4s $1, $1, v18 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
add e, e, a // d += T1;
add a, a, t // h = T1 + Sigma0(a);
eor t, c, d // y^z
and s, c, d // y&z
add.4s $5, v17, $1 // W+K
and t, t, bb // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add a, a, s
.endm
#if defined(__arm64__)
#include "ccarm_pac_bti_macros.h"
.subsections_via_symbols
.text
.p2align 4
K256:
.long 0x428a2f98
.long 0x71374491
.long 0xb5c0fbcf
.long 0xe9b5dba5
.long 0x3956c25b
.long 0x59f111f1
.long 0x923f82a4
.long 0xab1c5ed5
.long 0xd807aa98
.long 0x12835b01
.long 0x243185be
.long 0x550c7dc3
.long 0x72be5d74
.long 0x80deb1fe
.long 0x9bdc06a7
.long 0xc19bf174
.long 0xe49b69c1
.long 0xefbe4786
.long 0x0fc19dc6
.long 0x240ca1cc
.long 0x2de92c6f
.long 0x4a7484aa
.long 0x5cb0a9dc
.long 0x76f988da
.long 0x983e5152
.long 0xa831c66d
.long 0xb00327c8
.long 0xbf597fc7
.long 0xc6e00bf3
.long 0xd5a79147
.long 0x06ca6351
.long 0x14292967
.long 0x27b70a85
.long 0x2e1b2138
.long 0x4d2c6dfc
.long 0x53380d13
.long 0x650a7354
.long 0x766a0abb
.long 0x81c2c92e
.long 0x92722c85
.long 0xa2bfe8a1
.long 0xa81a664b
.long 0xc24b8b70
.long 0xc76c51a3
.long 0xd192e819
.long 0xd6990624
.long 0xf40e3585
.long 0x106aa070
.long 0x19a4c116
.long 0x1e376c08
.long 0x2748774c
.long 0x34b0bcb5
.long 0x391c0cb3
.long 0x4ed8aa4a
.long 0x5b9cca4f
.long 0x682e6ff3
.long 0x748f82ee
.long 0x78a5636f
.long 0x84c87814
.long 0x8cc70208
.long 0x90befffa
.long 0xa4506ceb
.long 0xbef9a3f7
.long 0xc67178f2
.p2align 4
.globl _AccelerateCrypto_SHA256_compress_arm64neon
_AccelerateCrypto_SHA256_compress_arm64neon:
BRANCH_TARGET_CALL
adrp ktable, K256@page
cbnz num_blocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
ret lr // otherwise, return
1:
add ktable, ktable, K256@pageoff
#if BUILDKERNEL
// save q0-q7, q16-q20 8+4+1=13
sub x4, sp, #13*16
sub sp, sp, #13*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20}, [x4]
#endif
// load W[0:15]
ldr qW0, [data, #0*16]
movi.16b zero, #0
ldr qW1, [data, #1*16]
ldr qW2, [data, #2*16]
ldr qW3, [data, #3*16]
add data, data, #4*16
// load K[0:15] & per word byte swap
rev32.16b W0, W0
ldr qWK0, [ktable, #0*16]
rev32.16b W1, W1
ldr qWK1, [ktable, #1*16]
rev32.16b W2, W2
ldr qWK2, [ktable, #2*16]
rev32.16b W3, W3
ldr qWK3, [ktable, #3*16]
// compute WK[0:15]
add ktable, ktable, #4*16
add.4s WK0, WK0, W0
ldp a, bb, [ctx, #0*4]
add.4s WK1, WK1, W1
ldp c, d, [ctx, #2*4]
add.4s WK2, WK2, W2
ldp e, f, [ctx, #4*4]
add.4s WK3, WK3, W3
ldp g, h, [ctx, #6*4]
L_loop:
// rounds 0:47 interleaved with W/WK update for rounds 16:63
mov _i_loop, #3
L_i_loop:
rounds_a_schedule_update 0,W0,W1,W2,W3, WK0
rounds_e_schedule_update 4,W1,W2,W3,W0, WK1
rounds_a_schedule_update 8,W2,W3,W0,W1, WK2
rounds_e_schedule_update 12,W3,W0,W1,W2, WK3
subs _i_loop, _i_loop, #1
b.gt L_i_loop
// revert K to the beginning of K256[]
subs num_blocks, num_blocks, #1 // num_blocks--
sub ktable, ktable, #256
b.eq L_final_block // if final block, wrap up final rounds
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
rounds_a_update_W_WK WK0, W0, qWK0, qW0
rounds_e_update_W_WK WK1, W1, qWK1, qW1
rounds_a_update_W_WK WK2, W2, qWK2, qW2
rounds_e_update_W_WK WK3, W3, qWK3, qW3
// ctx->states += digests a-h, also update digest variables a-h
Update_Digits
b.al L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds_a WK0
rounds_e WK1
rounds_a WK2
rounds_e WK3
// ctx->states += digests a-h
Update_Digits
#if BUILDKERNEL
// restore q0-q7, q16-q20
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20}, [sp], #16
#endif
ret lr
#endif /* arm64 */

View File

@ -0,0 +1,30 @@
/* Copyright (c) (2010,2014-2016,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stdint.h>
#include <corecrypto/cc_config.h>
/* the K array */
const uint32_t sha256_K[64] CC_ALIGNED(16) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b,
0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01,
0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7,
0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152,
0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc,
0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819,
0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08,
0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f,
0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

View File

@ -0,0 +1,30 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
#if (defined(__x86_64__) || defined(__i386__))
extern void AccelerateCrypto_SHA256_compress_ssse3(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_ssse3");
extern void AccelerateCrypto_SHA256_compress_AVX1(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_AVX1");
extern void AccelerateCrypto_SHA256_compress_AVX2(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_AVX2");
void AccelerateCrypto_SHA256_compress(uint32_t *state, size_t num, const void *buf)
{
#if defined(__x86_64__)
if (HAS_AVX2()) AccelerateCrypto_SHA256_compress_AVX2(state, num, buf);
else if (HAS_AVX1()) AccelerateCrypto_SHA256_compress_AVX1(state, num, buf);
else
#endif
AccelerateCrypto_SHA256_compress_ssse3(state, num, buf);
}
#endif // (defined(__x86_64__) || defined(__i386__))
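The dispatcher above picks the widest implementation the CPU supports at run time: AVX2,
then AVX1, falling back to the SSSE3 baseline (on i386 only the SSSE3 path is compiled in).
A hedged usage sketch follows; it assumes AccelerateCrypto.h declares the function with the
signature defined above, seeds the state with the standard SHA-256 IV (FIPS 180-4), and
leaves message padding and finalization to the caller.

#include <stdint.h>
#include <stddef.h>
#include "AccelerateCrypto.h"

static void sha256_state_init(uint32_t state[8])
{
    static const uint32_t iv[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    for (int i = 0; i < 8; i++) state[i] = iv[i];
}

// feed whole 64-byte blocks; nbytes must be a multiple of 64
static void sha256_compress_blocks(uint32_t state[8], const void *buf, size_t nbytes)
{
    AccelerateCrypto_SHA256_compress(state, nbytes / 64, buf);
}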

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,504 @@
# Copyright (c) (2010,2011,2012,2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides i386 hand implementation of the following function
sha2_void sha256_compile(sha256_ctx ctx[1]);
which is a C function in CommonCrypto Source/Digest/sha2.c
The implementation here is modified from another i386 sha256 implementation in xnu.
To fit the new API,
the old ctx (which pointed to ctx->hashes) should be changed to ctx->hashes, 8(ctx), and
the old data (which pointed to ctx->wbuf) should be changed to ctx->wbuf, 40(ctx).
sha256_compile handles 1 input block (64 bytes) per call.
The following is comments for the initial xnu-sha256.s.
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
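The body below loads each 16-byte group of the message with movdqu, byte-swaps it per
32-bit word through the pshufb mask kept in L_aligned_bswap, and adds K256 with paddd to
precompute W+K. A hedged intrinsics sketch of that setup step (names are local to the
sketch; the shuffle constant is the usual per-word byte-reversal mask and sha256_K is the
table defined elsewhere in this package):

#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
#include <stdint.h>

extern const uint32_t sha256_K[64];

static void sha256_load_W_WK(const uint8_t block[64], __m128i W[4], __m128i WK[4])
{
    const __m128i bswap = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
                                       4, 5, 6, 7, 0, 1, 2, 3);
    for (int i = 0; i < 4; i++) {
        __m128i w = _mm_loadu_si128((const __m128i *)(block + 16 * i));
        w = _mm_shuffle_epi8(w, bswap);                 // big-endian load of each 4-byte word
        W[i]  = w;
        WK[i] = _mm_add_epi32(w, _mm_loadu_si128((const __m128i *)(sha256_K + 4 * i)));
    }
}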
#if defined __i386__
// associate variables with registers or memory
#define sp %esp
#define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
#define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
#define num_blocks 24+stack_size(sp) // 2nd caller argument
#define data_addr 28+stack_size(sp) // 3rd caller argument
#define a %ebx
#define b %edx
#define c 64(sp)
#define d %ebp
#define e %esi
#define f 68(sp)
#define g %edi
#define h 72(sp)
#define K 76(sp) // pointer to K256[] table
#define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
#define xmm_save 96(sp) // starting address for xmm save/restore
// 2 local variables
#define t %eax
#define s %ecx
// a window (16 words) of message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*4(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mov $0, t // x
mov $0, s // x
not t // ~x
and $1, s // x & y
and $2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
mov $1, t // y
mov $2, s // z
xor $2, t // y^z
and $1, s // y&z
and $0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on an xmm registers
// use xmm6/xmm7 as intermediate registers
.macro sigma0
movdqa $0, %xmm6
movdqa $0, %xmm7
psrld $$3, $0 // SHR3(x)
psrld $$7, %xmm6 // part of ROTR7
pslld $$14, %xmm7 // part of ROTR18
pxor %xmm6, $0
pxor %xmm7, $0
psrld $$11, %xmm6 // part of ROTR18
pslld $$11, %xmm7 // part of ROTR7
pxor %xmm6, $0
pxor %xmm7, $0
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on an xmm registers
// use xmm6/xmm7 as intermediate registers
.macro sigma1
movdqa $0, %xmm6
movdqa $0, %xmm7
psrld $$10, $0 // SHR10(x)
psrld $$17, %xmm6 // part of ROTR17
pxor %xmm6, $0
pslld $$13, %xmm7 // part of ROTR19
pxor %xmm7, $0
psrld $$2, %xmm6 // part of ROTR19
pxor %xmm6, $0
pslld $$2, %xmm7 // part of ROTR17
pxor %xmm7, $0
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
mov $0, t // x
mov $0, s // x
ror $$2, t // S32(2, (x))
ror $$13, s // S32(13, (x))
xor s, t // S32(2, (x)) ^ S32(13, (x))
ror $$9, s // S32(22, (x))
xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
mov $0, s // x
ror $$6, s // S32(6, (x))
mov s, t // S32(6, (x))
ror $$5, s // S32(11, (x))
xor s, t // S32(6, (x)) ^ S32(11, (x))
ror $$14, s // S32(25, (x))
xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
Sigma1 $4 // t = T1
add t, $7 // use h to store h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g);
add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
add WK($8), t // h = T1
add t, $3 // d += T1;
mov t, $7 // h = T1
Sigma0 $0 // t = Sigma0(a);
add t, $7 // h = T1 + Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
.endm
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
.macro message_schedule
// 4 32-bit K256 words in xmm5
mov K, t
movdqu (t), %xmm5
addl $$16, K // K points to next K256 word for next iteration
movdqa $1, %xmm4 // W7:W4
palignr $$4, $0, %xmm4 // W4:W1
sigma0 %xmm4 // sigma0(W4:W1)
movdqa $3, %xmm6 // W15:W12
paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
palignr $$4, $2, %xmm6 // W12:W9
paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
movdqa $3, %xmm4 // W15:W12
psrldq $$8, %xmm4 // 0,0,W15,W14
sigma1 %xmm4 // sigma1(0,0,W15,W14)
paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
pslldq $$8, %xmm4 // W17, W16, 0, 0
sigma1 %xmm4 // sigma1(W17,W16,0,0)
paddd %xmm4, $0 // W19:W16
paddd $0, %xmm5 // WK
movdqa %xmm5, WK($4)
.endm
// this macro is used in the last 16 rounds of the current block
// it reads 4 4-byte words of the next message block, loads them (big-endian) into W[r:r+3], computes WK[r:r+3],
// and saves them into the stack circular buffer to prepare for the next block
.macro update_W_WK
mov data_addr, t
movdqu $0*16(t), $1 // read 4 4-byte words
pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
mov K, t
movdqu $0*16(t), %xmm4 // K[r:r+3]
paddd $1, %xmm4 // WK[r:r+3]
movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
.endm
.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_sha256_K$non_lazy_ptr:
.indirect_symbol CC_C_LABEL(sha256_K)
.long 0
.text
.globl _AccelerateCrypto_SHA256_compress_ssse3
_AccelerateCrypto_SHA256_compress_ssse3:
// push callee-saved registers
push %ebp
push %ebx
push %esi
push %edi
// allocate stack space
sub $stack_size, sp
// if kernel code, save used xmm registers
#if BUILDKERNEL
movdqa %xmm0, 0*16+xmm_save
movdqa %xmm1, 1*16+xmm_save
movdqa %xmm2, 2*16+xmm_save
movdqa %xmm3, 3*16+xmm_save
movdqa %xmm4, 4*16+xmm_save
movdqa %xmm5, 5*16+xmm_save
movdqa %xmm6, 6*16+xmm_save
movdqa %xmm7, 7*16+xmm_save
#endif
// set up bswap parameters in the aligned stack space and pointer to table K256[]
call 0f // Push program counter onto stack.
0: pop t // Get program counter.
mov L_sha256_K$non_lazy_ptr-0b(t), t
mov t, K
call 0f // Push program counter onto stack.
0: pop %eax // Get program counter.
lea L_bswap-0b(%eax), %eax
movdqa (%eax), %xmm0
movdqa %xmm0, L_aligned_bswap
// load W[0:15] into xmm0-xmm3
mov data_addr, t
movdqu 0*16(t), W0
movdqu 1*16(t), W1
movdqu 2*16(t), W2
movdqu 3*16(t), W3
addl $64, data_addr
pshufb L_aligned_bswap, W0
pshufb L_aligned_bswap, W1
pshufb L_aligned_bswap, W2
pshufb L_aligned_bswap, W3
// compute WK[0:15] and save in stack
mov K, t
movdqu 0*16(t), %xmm4
movdqu 1*16(t), %xmm5
movdqu 2*16(t), %xmm6
movdqu 3*16(t), %xmm7
addl $64, K
paddd %xmm0, %xmm4
paddd %xmm1, %xmm5
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm4, WK(0)
movdqa %xmm5, WK(4)
movdqa %xmm6, WK(8)
movdqa %xmm7, WK(12)
L_loop:
// digests a-h = ctx->states;
mov ctx_addr, t
mov 0*4(t), a
mov 1*4(t), b
mov 2*4(t), s
mov s, c
mov 3*4(t), d
mov 4*4(t), e
mov 5*4(t), s
mov s, f
mov 6*4(t), g
mov 7*4(t), s
mov s, h
// rounds 0:47 interleaved with W/WK update for rounds 16:63
rounds a, b, c, d, e, f, g, h, 0
message_schedule W0,W1,W2,W3,16
rounds e, f, g, h, a, b, c, d, 4
message_schedule W1,W2,W3,W0,20
rounds a, b, c, d, e, f, g, h, 8
message_schedule W2,W3,W0,W1,24
rounds e, f, g, h, a, b, c, d, 12
message_schedule W3,W0,W1,W2,28
rounds a, b, c, d, e, f, g, h, 16
message_schedule W0,W1,W2,W3,32
rounds e, f, g, h, a, b, c, d, 20
message_schedule W1,W2,W3,W0,36
rounds a, b, c, d, e, f, g, h, 24
message_schedule W2,W3,W0,W1,40
rounds e, f, g, h, a, b, c, d, 28
message_schedule W3,W0,W1,W2,44
rounds a, b, c, d, e, f, g, h, 32
message_schedule W0,W1,W2,W3,48
rounds e, f, g, h, a, b, c, d, 36
message_schedule W1,W2,W3,W0,52
rounds a, b, c, d, e, f, g, h, 40
message_schedule W2,W3,W0,W1,56
rounds e, f, g, h, a, b, c, d, 44
message_schedule W3,W0,W1,W2,60
// revert K to the beginning of K256[]
subl $256, K
subl $1, num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
rounds a, b, c, d, e, f, g, h, 48
update_W_WK 0, W0
rounds e, f, g, h, a, b, c, d, 52
update_W_WK 1, W1
rounds a, b, c, d, e, f, g, h, 56
update_W_WK 2, W2
rounds e, f, g, h, a, b, c, d, 60
update_W_WK 3, W3
addl $64, K
addl $64, data_addr
// ctx->states += digests a-h
mov ctx_addr, t
add a, 0*4(t)
add b, 1*4(t)
mov c, s
add s, 2*4(t)
add d, 3*4(t)
add e, 4*4(t)
mov f, s
add s, 5*4(t)
add g, 6*4(t)
mov h, s
add s, 7*4(t)
jmp L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds a, b, c, d, e, f, g, h, 48
rounds e, f, g, h, a, b, c, d, 52
rounds a, b, c, d, e, f, g, h, 56
rounds e, f, g, h, a, b, c, d, 60
// ctx->states += digests a-h
mov ctx_addr, t
add a, 0*4(t)
add b, 1*4(t)
mov c, s
add s, 2*4(t)
add d, 3*4(t)
add e, 4*4(t)
mov f, s
add s, 5*4(t)
add g, 6*4(t)
mov h, s
add s, 7*4(t)
// if kernel, restore xmm0-xmm7
#if BUILDKERNEL
movdqa 0*16+xmm_save, %xmm0
movdqa 1*16+xmm_save, %xmm1
movdqa 2*16+xmm_save, %xmm2
movdqa 3*16+xmm_save, %xmm3
movdqa 4*16+xmm_save, %xmm4
movdqa 5*16+xmm_save, %xmm5
movdqa 6*16+xmm_save, %xmm6
movdqa 7*16+xmm_save, %xmm7
#endif
// free allocated stack memory
add $stack_size, sp
// restore callee-saved registers
pop %edi
pop %esi
pop %ebx
pop %ebp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 4, 0x90
L_bswap:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // i386

File diff suppressed because it is too large


@ -0,0 +1,564 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides armv7 neon hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q8-q15
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in NEON registers (d0-d7)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
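/*
 For reference, a minimal C sketch of the round and schedule update described above
 (illustrative only: the helper names below are local to this comment and are not
 symbols defined in this file):

     #include <stdint.h>

     static inline uint64_t ror64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

     #define Ch(x,y,z)   (((x) & (y)) ^ (~(x) & (z)))
     #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
     #define Sigma0(x)   (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))
     #define Sigma1(x)   (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))
     #define Gamma0(x)   (ror64((x),  1) ^ ror64((x),  8) ^ ((x) >> 7))
     #define Gamma1(x)   (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))

     // schedule update for round r >= 16, with W[] kept as a 16-entry circular buffer
     W[r & 15] += Gamma1(W[(r - 2) & 15]) + W[(r - 7) & 15] + Gamma0(W[(r - 15) & 15]);

     // per-round digest update; afterwards the roles rotate: (a..h) <- (h,a,b,c,d,e,f,g)
     uint64_t T1 = h + Sigma1(e) + Ch(e, f, g) + K[r] + W[r & 15];
     d += T1;
     h  = T1 + Sigma0(a) + Maj(a, b, c);
*/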
#if (defined(__arm__) && defined(__ARM_NEON__))
// associate variables with registers or memory
#define stack_size (16*8)
#define ctx r0
#define num_blocks r1
#define data r2
/* use d0-d7 (q0-q3) for 8 digests */
#define a d0
#define b d1
#define c d2
#define d d3
#define e d4
#define f d5
#define g d6
#define h d7
#define K r3
// 3 local variables
#define s d8
#define t d9
#define u d10
// a window (16 quad-words) of message schedule
#define W0 q8
#define W1 q9
#define W2 q10
#define W3 q11
#define W4 q12
#define W5 q13
#define W6 q14
#define W7 q15
// circular buffer for WK[(r:r+15)%16]
#define WK(x) [sp,#((x)&15)*8]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
/* t = Ch($0, $1, $2) */
.macro Ch
veor t, $1, $2
vand t, t, $0
veor t, t, $2
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/* t = Maj($0, $1, $2) */
.macro Maj
veor t, $1, $2 // y^z
vand s, $1,$2 // y&z
vand t, t, $0 // x&(y^z)
veor t, t, s // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on two words in a vector register
// use q6/q7 as intermediate registers
.macro Gamma0
vshr.u64 q6, $0, #1 // part of S64(1, x)
vshl.i64 q7, $0, #56 // part of S64(8, x)
vshr.u64 $0, $0, #7 // R(7, x)
veor $0, $0, q6
vshr.u64 q6, q6, #7 // part of S64(8, x)
veor $0, $0, q7
vshl.i64 q7, q7, #7 // part of S64(1, x)
veor $0, $0, q6
veor $0, $0, q7
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on two words in a vector register
// use q6/q7 as intermediate registers
.macro Gamma1
vshr.u64 q6, $0, #19 // part of S64(19, x)
vshl.i64 q7, $0, #3 // part of S64(61, x)
vshr.u64 $0, $0, #6 // R(6, x)
veor $0, $0, q6
vshr.u64 q6, q6, #42 // part of S64(61, x)
veor $0, $0, q7
vshl.i64 q7, q7, #42 // part of S64(19, x)
veor $0, $0, q6
veor $0, $0, q7
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use q5-q7 for temp
*/
.macro message_update2
vext.64 q7, $4, $5, #1 // W[r-7]
vext.64 q5, $0, $1, #1 // W[r-15]
vadd.s64 $0, $0, q7 // W[r-16] + W[r-7];
Gamma0 q5
vadd.s64 $0, $0, q5 // W[r-16] + W[r-7] + Gamma0(W[r-15])
vshr.u64 q6, $7, #19 // Gamma1(W[r-2]), part of S64(19, x)
vshl.i64 q7, $7, #3 // part of S64(61, x)
vshr.u64 q5, $7, #6 // R(6, x)
veor q5, q5, q6
vshr.u64 q6, q6, #42 // part of S64(61, x)
veor q5, q5, q7
vshl.i64 q7, q7, #42 // part of S64(19, x)
veor q5, q5, q6
veor q5, q5, q7
vadd.s64 $0, $0, q5 // W[r-16] + W[r-7] + Gamma1(W7)
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0
vshr.u64 t, $0, #28
vshl.i64 s, $0, #25
vshr.u64 u, t, #6
veor t, t, s
vshl.i64 s, s, #5
veor t, t, u
vshr.u64 u, u, #5
veor t, t, s
vshl.i64 s, s, #6
veor t, t, u
veor t, t, s
.endm
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.macro Sigma1
vshr.u64 t, $0, #14
vshl.i64 s, $0, #23
vshr.u64 u, t, #4
veor t, t, s
vshl.i64 s, s, #23
veor t, t, u
vshr.u64 u, u, #23
veor t, t, s
vshl.i64 s, s, #4
veor t, t, u
veor t, t, s
.endm
// per round digests update
.macro round_ref
Sigma1 $4 // t = Sigma1(e);
vadd.s64 $7, $7, t // h = h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g);
vldr s, WK($8) // s = WK
vadd.s64 $7, $7, t // h = h+Sigma1(e)+Ch(e,f,g);
vadd.s64 $7, $7, s // h = h+Sigma1(e)+Ch(e,f,g)+WK
vadd.s64 $3, $3, $7 // d += h;
Sigma0 $0 // t = Sigma0(a);
vadd.s64 $7, $7, t // h += Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
vadd.s64 $7, $7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro round
Sigma1 $4 // t = Sigma1(e);
vldr s, WK($8) // s = WK
vadd.s64 $7, $7, t // h = h+Sigma1(e)
veor t, $5, $6
vadd.s64 $7, $7, s // h = h+Sigma1(e)+WK
vand t, t, $4
veor t, t, $6 // t = Ch (e, f, g);
vadd.s64 $7, $7, t // h = h+Sigma1(e)+Ch(e,f,g);
Sigma0 $0 // t = Sigma0(a);
vadd.s64 $3, $3, $7 // d += h;
vadd.s64 $7, $7, t // h += Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
vadd.s64 $7, $7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
/*
16 rounds of hash update, update input schedule W (in vector register v0-v7) and WK = W + K (in stack)
*/
.macro rounds_schedule
mov r12, sp
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W0
vst1.64 {q7}, [r12]!
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W1
vst1.64 {q7}, [r12]!
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W2
vst1.64 {q7}, [r12]!
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W3
vst1.64 {q7}, [r12]!
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W4
vst1.64 {q7}, [r12]!
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W5
vst1.64 {q7}, [r12]!
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W6
vst1.64 {q7}, [r12]!
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W7
vst1.64 {q7}, [r12]!
.endm
.macro rev64
vrev64.8 $0, $0
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register v0-v7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial
mov r12, sp
vld1.8 {W0}, [data]!
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
rev64 W0
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W0
vst1.64 {q7}, [r12]!
vld1.8 {W1}, [data]!
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
rev64 W1
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W1
vst1.64 {q7}, [r12]!
vld1.8 {W2}, [data]!
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
rev64 W2
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W2
vst1.64 {q7}, [r12]!
vld1.8 {W3}, [data]!
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
rev64 W3
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W3
vst1.64 {q7}, [r12]!
vld1.8 {W4}, [data]!
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
rev64 W4
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W4
vst1.64 {q7}, [r12]!
vld1.8 {W5}, [data]!
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
rev64 W5
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W5
vst1.64 {q7}, [r12]!
vld1.8 {W6}, [data]!
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
rev64 W6
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W6
vst1.64 {q7}, [r12]!
vld1.8 {W7}, [data]!
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
rev64 W7
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W7
vst1.64 {q7}, [r12]!
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
.endm
.p2align 4
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+8)
.p2align 4
.text
.globl _AccelerateCrypto_SHA512_compress
_AccelerateCrypto_SHA512_compress:
// push callee-saved registers
push {r4,r5,r7,lr}
add r7, sp, #8 // set up dtrace frame pointer
vpush {q4-q7}
#if BUILDKERNEL
vpush {q0-q3}
vpush {q8-q15}
#endif
// allocate stack space for WK[0:15]
sub sp, sp, #stack_size
ldr K, L_table1
L_table0:
mov r12, pc
ldr K, [r12, K]
vld1.8 {W0,W1}, [data]!
vld1.8 {W2,W3}, [data]!
vld1.8 {W4,W5}, [data]!
vld1.8 {W6,W7}, [data]!
rev64 W0
rev64 W1
rev64 W2
rev64 W3
rev64 W4
rev64 W5
rev64 W6
rev64 W7
mov r12, sp
// compute WK[0:15] and save in stack, use q0-q7 as they have not yet been used
vld1.8 {q0,q1}, [K,:128]!
vld1.8 {q2,q3}, [K,:128]!
vld1.8 {q4,q5}, [K,:128]!
vld1.8 {q6,q7}, [K,:128]!
vadd.s64 q0, q0, W0
vadd.s64 q1, q1, W1
vadd.s64 q2, q2, W2
vadd.s64 q3, q3, W3
vadd.s64 q4, q4, W4
vadd.s64 q5, q5, W5
vadd.s64 q6, q6, W6
vadd.s64 q7, q7, W7
vst1.32 {q0,q1}, [r12]!
vst1.32 {q2,q3}, [r12]!
vst1.32 {q4,q5}, [r12]!
vst1.32 {q6,q7}, [r12]!
L_loop:
// digests a-h = ctx->states;
mov r12, ctx
vld1.64 {q0,q1}, [r12]!
vld1.64 {q2,q3}, [r12]
// rounds 0:63 interleaved with W/WK update for rounds 16:79
mov r4, #4
L_i_loop:
rounds_schedule a, b, c, d, e, f, g, h, 16
subs r4, r4, #1
bgt L_i_loop
// revert K to the beginning of K512[]
sub K, K, #640
subs num_blocks, num_blocks, #1 // num_blocks--
beq L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
mov r12, ctx
vld1.64 {q4,q5}, [r12]!
vld1.64 {q6,q7}, [r12]
vadd.s64 q4, q0, q4
vadd.s64 q5, q1, q5
vadd.s64 q6, q2, q6
vadd.s64 q7, q3, q7
vst1.64 {q4,q5}, [ctx]
vst1.64 {q6,q7}, [r12]
bal L_loop // branch for next block
// wrap up digest update rounds 64:79 for final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
mov r12, ctx
vld1.64 {q4,q5}, [r12]!
vld1.64 {q6,q7}, [r12]
vadd.s64 q4, q0, q4
vadd.s64 q5, q1, q5
vadd.s64 q6, q2, q6
vadd.s64 q7, q3, q7
vst1.64 {q4,q5}, [ctx]
vst1.64 {q6,q7}, [r12]
// free allocated stack memory
add sp, sp, #stack_size
// if kernel, restore used vector registers
#if BUILDKERNEL
vpop {q8-q15}
vpop {q0-q3}
#endif
vpop {q4-q7}
// return
pop {r4,r5,r7,pc}
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 4
L_Tab$non_lazy_ptr:
.indirect_symbol _sha512_K
.long 0
#endif // (defined(__arm__) && defined(__ARM_NEON__))


@ -0,0 +1,622 @@
# Copyright (c) (2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides arm64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in v0-v7
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPRs (x4-x11)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __arm64__
#include "ccarm_pac_bti_macros.h"
// associate variables with registers or memory
#define stack_size (16*8)
#define ctx x0
#define num_blocks x1
#define data x2
#define a x4
#define bb x5
#define c x6
#define d x7
#define e x8
#define f x9
#define g x10
#define h x11
#define K x3
// 3 local variables
#define s x12
#define t x13
#define u x14
// a window (16 quad-words) of message schedule
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5
#define W6 v6
#define W7 v7
// circular buffer for WK[(r:r+15)%16]
#define WK(x) [sp,#((x)&15)*8]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
/* t = Ch($0, $1, $2) */
.macro Ch
eor t, $1, $2
and t, t, $0
eor t, t, $2
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/* t = Maj($0, $1, $2) */
.macro Maj
eor t, $1, $2 // y^z
and s, $1,$2 // y&z
and t, t, $0 // x&(y^z)
eor t, t, s // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on two words in a vector register
// use v20/v21 as intermediate registers
.macro Gamma0
ushr.2d v20, $0, #1 // part of S64(1, x)
shl.2d v21, $0, #56 // part of S64(8, x)
ushr.2d $0, $0, #7 // R(7, x)
eor.16b $0, $0, v20
ushr.2d v20, v20, #7 // part of S64(8, x)
eor.16b $0, $0, v21
shl.2d v21,v21, #7 // part of S64(1, x)
eor.16b $0, $0, v20
eor.16b $0, $0, v21
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on two words in a vector register
// use v16/v17 as intermediate registers
.macro Gamma1
ushr.2d v16, $0, #19 // part of S64(19, x)
shl.2d v17, $0, #3 // part of S64(61, x)
ushr.2d $0, $0, #6 // R(6, x)
eor.16b $0, $0, v16
ushr.2d v16, v16, #42 // part of S64(61, x)
eor.16b $0, $0, v17
shl.2d v17,v17, #42 // part of S64(19, x)
eor.16b $0, $0, v16
eor.16b $0, $0, v17
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use v16-v19 for temp
*/
.macro message_update2 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7
ext.16b v18, \vec4, \vec5, #8 // vext(W4,W5)
ext.16b v19, \vec0, \vec1, #8 // vext(W0,W1)
ushr.2d v16, \vec7, #19 // part of S64(19, x)
shl.2d v17, \vec7, #3 // part of S64(61, x)
add.2d \vec0, \vec0, v18 // W0 + vext(W4,W5)
ushr.2d v18, \vec7, #6 // R(6,x)
ushr.2d v20, v19, #1 // part of S64(1, x)
shl.2d v21, v19, #56 // part of S64(8, x)
ushr.2d v19, v19, #7 // R(7, x)
eor.16b v18, v18, v16
ushr.2d v16, v16, #42 // part of S64(61, x)
eor.16b v19, v19, v20
ushr.2d v20, v20, #7 // part of S64(8, x)
eor.16b v18, v18, v17
shl.2d v17, v17, #42 // part of S64(19, x)
eor.16b v19, v19, v21
shl.2d v21,v21, #7 // part of S64(1, x)
eor.16b v18, v18, v16
eor.16b v19, v19, v20
eor.16b v18, v18, v17
eor.16b v19, v19, v21
add.2d \vec0, \vec0, v18 // W0 + Gamma1(W7) + vext(W4,W5)
add.2d \vec0, \vec0, v19 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0
ror t, $0, #28
eor t, t, $0, ror #34
eor t, t, $0, ror #39
.endm
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.macro Sigma1
ror t, $0, #14
eor t, t, $0, ror #18
eor t, t, $0, ror #41
.endm
// per round digests update
.macro round_ref
Sigma1 $4 // t = Sigma1(e);
add $7, $7, t // h = h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g);
ldr s, WK($8) // s = WK
add $7, $7, t // h = h+Sigma1(e)+Ch(e,f,g);
add $7, $7, s // h = h+Sigma1(e)+Ch(e,f,g)+WK
add $3, $3, $7 // d += h;
Sigma0 $0 // t = Sigma0(a);
add $7, $7, t // h += Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
add $7, $7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro round s0, s1, s2, s3, s4, s5, s6, s7, s8
ror t, \s4, #14
eor s, \s5, \s6
ldr u, WK(\s8) // u = WK
eor t, t, \s4, ror #18
and s, s, \s4
add \s7, \s7, u // h = h+WK
eor t, t, \s4, ror #41
eor s, s, \s6
add \s7, \s7, t // h = h+WK+Sigma1(e)
eor t, \s1, \s2 // y^z
add \s7, \s7, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s0, #28
add \s3, \s3, \s7 // d += h;
and u, \s1,\s2 // y&z
eor s, s, \s0, ror #34
and t, t, \s0 // x&(y^z)
eor s, s, \s0, ror #39
eor t, t, u // Maj(x,y,z)
add \s7, \s7, s // h += Sigma0(a);
add \s7, \s7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro combined_message_round_update2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7
//
// message_update2 \vec0, \vec1, \vec2, \vec3, \vec4, \vec5, \vec6, \vec7
// round \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7, 0+\s8+\s9
// round \s7, \s0, \s1, \s2, \s3, \s4, \s5, \s6, 1+\s8+\s9
ror t, \s4, #14
ldr u, WK(0+\s8+\s9) // u = WK
eor s, \s5, \s6
ext.16b v18, \vec4, \vec5, #8 // vext(W4,W5)
eor t, t, \s4, ror #18
and s, s, \s4
ext.16b v19, \vec0, \vec1, #8 // vext(W0,W1)
add \s7, \s7, u // h = h+WK
eor t, t, \s4, ror #41
ushr.2d v16, \vec7, #19 // part of S64(19, x)
eor s, s, \s6
add \s7, \s7, t // h = h+WK+Sigma1(e)
shl.2d v17, \vec7, #3 // part of S64(61, x)
eor t, \s1, \s2 // y^z
add.2d \vec0, \vec0, v18 // W0 + vext(W4,W5)
ushr.2d v18, \vec7, #6 // R(6,x)
add \s7, \s7, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ushr.2d v20, v19, #1 // part of S64(1, x)
ror s, \s0, #28
shl.2d v21, v19, #56 // part of S64(8, x)
add \s3, \s3, \s7 // d += h;
ushr.2d v19, v19, #7 // R(7, x)
and u, \s1,\s2 // y&z
eor.16b v18, v18, v16
eor s, s, \s0, ror #34
ushr.2d v16, v16, #42 // part of S64(61, x)
and t, t, \s0 // x&(y^z)
eor.16b v19, v19, v20
eor s, s, \s0, ror #39
ushr.2d v20, v20, #7 // part of S64(8, x)
eor t, t, u // Maj(x,y,z)
eor.16b v18, v18, v17
add \s7, \s7, s // h += Sigma0(a);
shl.2d v17, v17, #42 // part of S64(19, x)
add \s7, \s7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
eor.16b v19, v19, v21
ror t, \s3, #14
shl.2d v21,v21, #7 // part of S64(1, x)
ldr u, WK(1+\s8+\s9) // u = WK
eor s, \s4, \s5
eor.16b v18, v18, v16
ldr q16, [K]
eor t, t, \s3, ror #18
eor.16b v19, v19, v20
add K, K, #16
eor.16b v18, v18, v17
and s, s, \s3
eor.16b v19, v19, v21
add \s6, \s6, u // h = h+WK
add.2d \vec0, \vec0, v18 // W0 + Gamma1(W7) + vext(W4,W5)
eor t, t, \s3, ror #41
add.2d \vec0, \vec0, v19 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
eor s, s, \s5
add \s6, \s6, t // h = h+WK+Sigma1(e)
eor t, \s0, \s1 // y^z
add.2d v16, v16, \vec0
add \s6, \s6, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s7, #28
add \s2, \s2, \s6 // d += h;
and u, \s0,\s1 // y&z
eor s, s, \s7, ror #34
and t, t, \s7 // x&(y^z)
eor s, s, \s7, ror #39
eor t, t, u // Maj(x,y,z)
add \s6, \s6, s // h += Sigma0(a);
add \s6, \s6, t // h = T1 + Sigma0(a) + Maj(a,b,c);
str q16, WK(\s9)
.endm
/*
16 rounds of hash update, update input schedule W (in vector register v0-v7) and WK = W + K (in stack)
*/
.macro rounds_schedule
combined_message_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 0, W0, W1, W2, W3, W4, W5, W6, W7
combined_message_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8, 2, W1, W2, W3, W4, W5, W6, W7, W0
combined_message_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8, 4, W2, W3, W4, W5, W6, W7, W0, W1
combined_message_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8, 6, W3, W4, W5, W6, W7, W0, W1, W2
combined_message_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 8, W4, W5, W6, W7, W0, W1, W2, W3
combined_message_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8,10, W5, W6, W7, W0, W1, W2, W3, W4
combined_message_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8,12, W6, W7, W0, W1, W2, W3, W4, W5
combined_message_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8,14, W7, W0, W1, W2, W3, W4, W5, W6
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register v0-v7) and update WK = W + K (in stack)
*/
.macro combined_initial_round_update2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, vec0
ror t, \s4, #14
ldr u, WK(0+\s8+\s9) // u = WK
eor s, \s5, \s6
ld1.16b {\vec0}, [data], #16
eor t, t, \s4, ror #18
and s, s, \s4
add \s7, \s7, u // h = h+WK
eor t, t, \s4, ror #41
eor s, s, \s6
add \s7, \s7, t // h = h+WK+Sigma1(e)
eor t, \s1, \s2 // y^z
add \s7, \s7, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s0, #28
ld1.2d {v16}, [K], #16
add \s3, \s3, \s7 // d += h;
and u, \s1,\s2 // y&z
eor s, s, \s0, ror #34
and t, t, \s0 // x&(y^z)
eor s, s, \s0, ror #39
eor t, t, u // Maj(x,y,z)
add \s7, \s7, s // h += Sigma0(a);
add \s7, \s7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
ror t, \s3, #14
eor s, \s4, \s5
ldr u, WK(1+\s8+\s9) // u = WK
eor t, t, \s3, ror #18
and s, s, \s3
add \s6, \s6, u // h = h+WK
rev64.16b \vec0, \vec0
eor t, t, \s3, ror #41
eor s, s, \s5
add \s6, \s6, t // h = h+WK+Sigma1(e)
eor t, \s0, \s1 // y^z
add \s6, \s6, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s7, #28
add.2d v16, v16, \vec0
add \s2, \s2, \s6 // d += h;
and u, \s0,\s1 // y&z
eor s, s, \s7, ror #34
and t, t, \s7 // x&(y^z)
eor s, s, \s7, ror #39
eor t, t, u // Maj(x,y,z)
add \s6, \s6, s // h += Sigma0(a);
str q16, WK(\s9)
add \s6, \s6, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro rounds_schedule_initial
combined_initial_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 0, W0
combined_initial_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8, 2, W1
combined_initial_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8, 4, W2
combined_initial_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8, 6, W3
combined_initial_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 8, W4
combined_initial_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8,10, W5
combined_initial_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8,12, W6
combined_initial_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8,14, W7
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
.endm
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_SHA512_compress
_AccelerateCrypto_SHA512_compress:
BRANCH_TARGET_CALL
#ifdef __ILP32__
uxtw num_blocks, num_blocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
adrp K, _sha512_K@page
cbnz num_blocks, 1f // if the number of blocks is nonzero, go on with the sha512 transform
ret lr // otherwise, return
1:
add K, K, _sha512_K@pageoff
#if BUILDKERNEL
// v0-v7, v16-v23
sub x4, sp, #16*16
sub sp, sp, #16*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20, v21, v22, v23}, [x4], #64
#endif
// allocate stack space for WK[0:15]
sub sp, sp, #stack_size
ldr q0, [data], #128
ldr q1, [data, #-112]
ldr q2, [data, #-96]
ldr q3, [data, #-80]
rev64.16b v0, v0
ldr q4, [data, #-64]
rev64.16b v1, v1
ldr q5, [data, #-48]
rev64.16b v2, v2
ldr q6, [data, #-32]
rev64.16b v3, v3
ldr q7, [data, #-16]
rev64.16b v4, v4
ldr q16, [K], #64
rev64.16b v5, v5
ldr q17, [K, #-48]
rev64.16b v6, v6
ldr q18, [K, #-32]
rev64.16b v7, v7
ldr q19, [K, #-16]
// compute WK[0:15] and save in stack
add.2d v20, v16, v0
ldr q16, [K], #64
add.2d v21, v17, v1
ldr q17, [K, #-48]
add.2d v22, v18, v2
ldr q18, [K, #-32]
add.2d v23, v19, v3
ldr q19, [K, #-16]
add.2d v16, v16, v4
str q20, [sp]
add.2d v17, v17, v5
str q21, [sp, #16*1]
add.2d v18, v18, v6
str q22, [sp, #16*2]
add.2d v19, v19, v7
str q23, [sp, #16*3]
str q16, [sp, #16*4]
str q17, [sp, #16*5]
str q18, [sp, #16*6]
str q19, [sp, #16*7]
L_loop:
// digests a-h = ctx->states;
ldp a, bb, [ctx]
ldp c, d, [ctx, #16]
ldp e, f, [ctx, #32]
ldp g, h, [ctx, #48]
// rounds 0:63 interleaved with W/WK update for rounds 16:79
mov w15, #4
L_i_loop:
rounds_schedule a, bb, c, d, e, f, g, h, 16
subs w15, w15, #1
b.gt L_i_loop
// revert K to the beginning of K512[]
sub K, K, #640
subs num_blocks, num_blocks, #1 // num_blocks--
b.eq L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, bb, c, d, e, f, g, h, 0
// ctx->states += digests a-h
ldp s, t, [ctx]
add s, s, a
add t, t, bb
stp s, t, [ctx]
ldp s, t, [ctx, #16]
add s, s, c
add t, t, d
stp s, t, [ctx, #16]
ldp s, t, [ctx, #32]
add s, s, e
add t, t, f
stp s, t, [ctx, #32]
ldp s, t, [ctx, #48]
add s, s, g
add t, t, h
stp s, t, [ctx, #48]
b L_loop // branch for next block
// wrap up digest update rounds 64:79 for final block
L_final_block:
rounds_schedule_final a, bb, c, d, e, f, g, h, 0
// ctx->states += digests a-h
ldp s, t, [ctx]
add s, s, a
add t, t, bb
stp s, t, [ctx]
ldp s, t, [ctx, #16]
add s, s, c
add t, t, d
stp s, t, [ctx, #16]
ldp s, t, [ctx, #32]
add s, s, e
add t, t, f
stp s, t, [ctx, #32]
ldp s, t, [ctx, #48]
add s, s, g
add t, t, h
stp s, t, [ctx, #48]
// if kernel, restore used vector registers
#if BUILDKERNEL
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20, v21, v22, v23}, [sp], #64
#endif
// free allocated stack memory
add sp, sp, #stack_size
// return
ret lr
#endif // __arm64__


@ -0,0 +1,259 @@
# Copyright (c) (2016,2018,2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides arm64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in v0-v7
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) are kept as 64-bit pairs in v8-v11, with per-block working copies in v24-v27
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __arm64__
#include "ccarm_pac_bti_macros.h"
.macro swap_hilo
ext.16b $0, $0, $0, #8
.endm
.macro ext16b
ext.16b $0, $1, $2, #8
.endm
.text
.align 4
.globl _AccelerateCrypto_SHA512_compress_hwassist
_AccelerateCrypto_SHA512_compress_hwassist:
BRANCH_TARGET_CALL
#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3
#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
adrp ktable, _ccsha512_K@page
cbnz numblocks, 1f
ret lr // otherwise, return
1:
add ktable, ktable, _ccsha512_K@pageoff
#if BUILDKERNEL
sub x4, sp, #28*16
sub sp, sp, #28*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20, v21, v22, v23}, [x4], #64
st1.4s {v24, v25, v26, v27}, [x4], #64
st1.4s {v28, v29, v30, v31}, [x4], #64
#else
sub x4, sp, #4*16
sub sp, sp, #4*16
#endif
st1.4s {v8, v9, v10, v11}, [x4], #64
ld1.2d {v8,v9,v10,v11}, [hashes] // (a,b) (c,d) (e,f) (g,h)
L_loop:
mov.16b v24, v8
ldr q0, [data, #0*16]
mov.16b v25, v9
ldr q1, [data, #1*16]
mov.16b v26, v10
ldr q2, [data, #2*16]
mov.16b v27, v11
ldr q3, [data, #3*16]
rev64.16b v0, v0
ldr q4, [data, #4*16]
rev64.16b v1, v1
ldr q5, [data, #5*16]
rev64.16b v2, v2
ldr q6, [data, #6*16]
rev64.16b v3, v3
ldr q7, [data, #7*16]
rev64.16b v4, v4
ldr q16, [ktable, #0*16]
rev64.16b v5, v5
ldr q17, [ktable, #1*16]
rev64.16b v6, v6
ldr q18, [ktable, #2*16]
rev64.16b v7, v7
ldr q19, [ktable, #3*16]
add.2d v16, v16, v0
ldr q20, [ktable, #4*16]
add.2d v17, v17, v1
ldr q21, [ktable, #5*16]
add.2d v18, v18, v2
ldr q22, [ktable, #6*16]
add.2d v19, v19, v3
ldr q23, [ktable, #7*16]
add.2d v20, v20, v4
add data, data, #8*16
add.2d v21, v21, v5
add ktable, ktable, #8*16
add.2d v22, v22, v6
add.2d v23, v23, v7
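// The round macros below rely on the ARMv8.2 SHA-512 extension ("hwassist"):
// sha512h/sha512h2 perform the two halves of the two-round digest update, and
// sha512su0/sha512su1 perform the corresponding message-schedule update.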
.macro sha512_round S0, S1, S2, S3, WK, w0, w1, w4, w5, w7, i
ext16b \WK, \WK, \WK
ext16b v29, \S2, \S3
ext16b v28, \S1, \S2
add.2d \S3, \S3, \WK
ext16b v31, \w4, \w5
ldr q30, [ktable, #\i*16]
sha512h.2d \S3, v29, v28
sha512su0.2d \w0, \w1
mov.16b v28, \S3
sha512h2.2d \S3, \S1, \S0
sha512su1.2d \w0, \w7, v31
add.2d \S1, \S1, v28
add.2d \WK, \w0, v30
.endm
.macro sha512_8_rounds
sha512_round v24, v25, v26, v27, v16, v0, v1, v4, v5, v7, 0
sha512_round v27, v24, v25, v26, v17, v1, v2, v5, v6, v0, 1
sha512_round v26, v27, v24, v25, v18, v2, v3, v6, v7, v1, 2
sha512_round v25, v26, v27, v24, v19, v3, v4, v7, v0, v2, 3
sha512_round v24, v25, v26, v27, v20, v4, v5, v0, v1, v3, 4
sha512_round v27, v24, v25, v26, v21, v5, v6, v1, v2, v4, 5
sha512_round v26, v27, v24, v25, v22, v6, v7, v2, v3, v5, 6
sha512_round v25, v26, v27, v24, v23, v7, v0, v3, v4, v6, 7
add ktable, ktable, #16*8
.endm
.macro sha512_round_final S0, S1, S2, S3, WK, w0, w1, w4, w5, w7
ext16b \WK, \WK, \WK
ext16b v29, \S2, \S3
ext16b v28, \S1, \S2
add.2d v30, \S3, \WK
sha512h.2d v30, v29, v28
mov.16b \S3, v30
sha512h2.2d \S3, \S1, \S0
add.2d \S1, \S1, v30
.endm
.macro sha512_8_rounds_final
sha512_round_final v24, v25, v26, v27, v16
sha512_round_final v27, v24, v25, v26, v17
sha512_round_final v26, v27, v24, v25, v18
sha512_round_final v25, v26, v27, v24, v19
sha512_round_final v24, v25, v26, v27, v20
sha512_round_final v27, v24, v25, v26, v21
sha512_round_final v26, v27, v24, v25, v22
sha512_round_final v25, v26, v27, v24, v23
.endm
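// each sha512_round/sha512_round_final step covers two SHA-512 rounds, so the four
// sha512_8_rounds groups plus the final group below cover all 80 rounds of one block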
sha512_8_rounds
sha512_8_rounds
sha512_8_rounds
sha512_8_rounds
sha512_8_rounds_final
add.2d v8, v8, v24
add.2d v9, v9, v25
add.2d v10, v10, v26
add.2d v11, v11, v27
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
sub ktable, ktable, #640
b.gt L_loop
st1.2d {v8,v9,v10,v11}, [hashes]
#if BUILDKERNEL
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20, v21, v22, v23}, [sp], #64
ld1.4s {v24, v25, v26, v27}, [sp], #64
ld1.4s {v28, v29, v30, v31}, [sp], #64
#endif
ld1.4s {v8, v9, v10, v11}, [sp], #64
ret lr
#endif


@ -0,0 +1,29 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
#if defined(__x86_64__)
extern void AccelerateCrypto_SHA512_compress_ssse3(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_ssse3");
extern void AccelerateCrypto_SHA512_compress_AVX1(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_AVX1");
extern void AccelerateCrypto_SHA512_compress_AVX2(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_AVX2");
void AccelerateCrypto_SHA512_compress(uint64_t *state, size_t num, const void *buf)
{
if (HAS_AVX2()) AccelerateCrypto_SHA512_compress_AVX2(state, num, buf);
else if (HAS_AVX1()) AccelerateCrypto_SHA512_compress_AVX1(state, num, buf);
else
AccelerateCrypto_SHA512_compress_ssse3(state, num, buf);
}
#endif // defined(__x86_64__)
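/*
 Usage sketch (illustrative; assumes the caller has already produced whole padded
 128-byte blocks -- this routine only runs the compression function):

     #include <stdint.h>

     uint64_t state[8] = {   // standard SHA-512 initial values (FIPS 180-4)
         0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
         0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
         0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
         0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
     };
     uint8_t blocks[2 * 128];  // two padded 128-byte message blocks, filled by the caller
     AccelerateCrypto_SHA512_compress(state, 2, blocks);  // num = number of 128-byte blocks
*/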


@ -0,0 +1,616 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides x86_64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx1/avx2)
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __x86_64__
// associate variables with registers or memory
#define sp %rsp
#define ctx %rdi
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
#define data %rdx
#define a %r8
#define b %r9
#define c %r10
#define d %r11
#define e %r12
#define f %r13
#define g %r14
#define h %r15
#define K %rbx
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define stack_size (8+32*12+128+16) // 8 (_num_blocks) + ymm0:ymm11 + WK(0:15) + 16byte for 32-byte alignment
#define L_aligned_bswap L_bswap(%rip) // bswap : big-endian loading of 8-byte words
#define ymm_save 128(sp) // starting address for ymm save/restore
// 3 local variables
#define s %rax
#define t %rcx
#define u %rsi
// a window (16 quad-words) of message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
#define W4 %xmm4
#define W5 %xmm5
#define W6 %xmm6
#define W7 %xmm7
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*8(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch arg0, arg1, arg2
#if 1
mov \arg2, t // z
xor \arg1, t // y^z
and \arg0, t // x&(y^z)
xor \arg2, t // t = (x&(y^z))^z = Ch(x,y,z)
#else
mov \arg0, t // x
mov \arg0, s // x
not t // ~x
and \arg1, s // x & y
and \arg2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
#endif
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj arg0, arg1, arg2
mov \arg1, t // y
mov \arg2, s // z
xor \arg2, t // y^z
and \arg1, s // y&z
and \arg0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on two words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma0 arg0
vpsrlq $1, \arg0, %xmm8 // part of S64(1, x)
vpsllq $56, \arg0, %xmm9 // part of S64(8, x)
vpsrlq $7, \arg0, \arg0 // R(7, x)
vpxor %xmm8, \arg0, \arg0
vpsrlq $7, %xmm8, %xmm8 // part of S64(8, x)
vpxor %xmm9, \arg0, \arg0
vpsllq $7, %xmm9, %xmm9 // part of S64(1, x)
vpxor %xmm8, \arg0, \arg0
vpxor %xmm9, \arg0, \arg0
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on two words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma1 arg0
vpsrlq $19, \arg0, %xmm8 // part of S64(19, x)
vpsllq $3, \arg0, %xmm9 // part of S64(61, x)
vpsrlq $6, \arg0, \arg0 // R(6, x)
vpxor %xmm8, \arg0, \arg0
vpsrlq $42, %xmm8, %xmm8 // part of S64(61, x)
vpxor %xmm9, \arg0, \arg0
vpsllq $42, %xmm9, %xmm9 // part of S64(19, x)
vpxor %xmm8, \arg0, \arg0
vpxor %xmm9, \arg0, \arg0
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use %xmm10, %xmm11 for temp
*/
.macro message_update2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
vpalignr $8, \arg4, \arg5, %xmm10 // vext(W4,W5)
vpalignr $8, \arg0, \arg1, %xmm11 // vext(W0,W1)
vpaddq %xmm10, \arg0, \arg0 // W0 + vext(W4,W5)
// vmovdqa \arg7, %xmm10
// Gamma1 %xmm10 // Gamma1(W7)
vpsrlq $19, \arg7, %xmm8 // part of S64(19, x)
vpsllq $3, \arg7, %xmm9 // part of S64(61, x)
vpsrlq $6, \arg7, %xmm10 // R(6, x)
vpxor %xmm8, %xmm10, %xmm10
vpsrlq $42, %xmm8, %xmm8 // part of S64(61, x)
vpxor %xmm9, %xmm10, %xmm10
vpsllq $42, %xmm9, %xmm9 // part of S64(19, x)
vpxor %xmm8, %xmm10, %xmm10
vpxor %xmm9, %xmm10, %xmm10
Gamma0 %xmm11 // Gamma0(vext(W0,W1))
vpaddq %xmm10, \arg0, \arg0 // W0 + Gamma1(W7) + vext(W4,W5)
vpaddq %xmm11, \arg0, \arg0 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0 arg0
mov \arg0, t // x
mov \arg0, s // x
ror $28, t // S(28, (x))
ror $34, s // S(34, (x))
xor s, t // S(28, (x)) ^ S(34, (x))
ror $5, s // S(39, (x))
xor s, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.macro Sigma1 arg0
mov \arg0, s // x
ror $14, s // S(14, (x))
mov s, t // S(14, (x))
ror $4, s // S(18, (x))
xor s, t // S(14, (x)) ^ S(18, (x))
ror $23, s // S(41, (x))
xor s, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm
// per round digests update
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
Sigma1 \arg4 // t = Sigma1(e);
add t, \arg7 // h = h+Sigma1(e)
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
add t, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), \arg7 // h = h+Sigma1(e)+Ch(e,f,g)+WK
add \arg7, \arg3 // d += h;
Sigma0 \arg0 // t = Sigma0(a);
add t, \arg7 // h += Sigma0(a);
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
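// scheduled variant of round_ref: Sigma1(e) is folded as ror14(e ^ ror4(e ^ ror23(e))),
// Sigma0(a) as ror28(a ^ ror6(a ^ ror5(a))), and Maj(a,b,c) as ((a|c)&b)|(a&c),
// interleaving the scalar work to shorten dependency chains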
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
mov \arg4, s
mov \arg0, t
ror $(41-18), s
ror $(39-34), t
xor \arg4, s
mov \arg5, u
xor \arg0, t
ror $(18-14), s
xor \arg6, u
xor \arg4, s
ror $(34-28), t
and \arg4, u
xor \arg0, t
xor \arg6, u
ror $14, s
ror $28, t
add s, u
mov \arg0, s
add WK(\arg8), u
or \arg2, s
add u, \arg7
mov \arg0, u
add \arg7, \arg3
and \arg1, s
and \arg2, u
or u, s
add t, \arg7
add s, \arg7
.endm
/*
16 rounds of hash update, update input schedule W (in vector register xmm0-xmm7) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
vmovdqa 0*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
vpaddq W0, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vmovdqa %xmm8, WK(0)
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
vmovdqa 1*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
vpaddq W1, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %xmm8, WK(2)
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
vmovdqa 2*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
vpaddq W2, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vmovdqa %xmm8, WK(4)
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
vmovdqa 3*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
vpaddq W3, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %xmm8, WK(6)
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
vmovdqa 4*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
vpaddq W4, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vmovdqa %xmm8, WK(8)
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
vmovdqa 5*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
vpaddq W5, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %xmm8, WK(10)
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
vmovdqa 6*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
vpaddq W6, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vmovdqa %xmm8, WK(12)
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
vmovdqa 7*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
vpaddq W7, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %xmm8, WK(14)
addq $128, K
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vmovdqu 0*16(data), W0
vmovdqa 0*16(K), %xmm8
vpshufb L_aligned_bswap, W0, W0
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
vpaddq W0, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vmovdqa %xmm8, WK(0)
vmovdqu 1*16(data), W1
vmovdqa 1*16(K), %xmm8
vpshufb L_aligned_bswap, W1, W1
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
vpaddq W1, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %xmm8, WK(2)
vmovdqu 2*16(data), W2
vmovdqa 2*16(K), %xmm8
vpshufb L_aligned_bswap, W2, W2
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
vpaddq W2, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vmovdqa %xmm8, WK(4)
vmovdqu 3*16(data), W3
vmovdqa 3*16(K), %xmm8
vpshufb L_aligned_bswap, W3, W3
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
vpaddq W3, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %xmm8, WK(6)
vmovdqu 4*16(data), W4
vmovdqa 4*16(K), %xmm8
vpshufb L_aligned_bswap, W4, W4
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
vpaddq W4, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vmovdqa %xmm8, WK(8)
vmovdqu 5*16(data), W5
vmovdqa 5*16(K), %xmm8
vpshufb L_aligned_bswap, W5, W5
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
vpaddq W5, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %xmm8, WK(10)
vmovdqu 6*16(data), W6
vmovdqa 6*16(K), %xmm8
vpshufb L_aligned_bswap, W6, W6
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
vpaddq W6, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vmovdqa %xmm8, WK(12)
vmovdqu 7*16(data), W7
vmovdqa 7*16(K), %xmm8
vpshufb L_aligned_bswap, W7, W7
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
vpaddq W7, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %xmm8, WK(14)
addq $128, K
addq $128, data
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
.text
.globl _AccelerateCrypto_SHA512_compress_AVX1
_AccelerateCrypto_SHA512_compress_AVX1:
// push callee-saved registers
push %rbp
movq %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack space
sub $stack_size, sp
andq $-32, sp // align sp to 32 bytes
// if kernel code, save used ymm registers
#if BUILDKERNEL
vmovdqa %ymm0, 0*32+ymm_save
vmovdqa %ymm1, 1*32+ymm_save
vmovdqa %ymm2, 2*32+ymm_save
vmovdqa %ymm3, 3*32+ymm_save
vmovdqa %ymm4, 4*32+ymm_save
vmovdqa %ymm5, 5*32+ymm_save
vmovdqa %ymm6, 6*32+ymm_save
vmovdqa %ymm7, 7*32+ymm_save
vmovdqa %ymm8, 8*32+ymm_save
vmovdqa %ymm9, 9*32+ymm_save
vmovdqa %ymm10, 10*32+ymm_save
vmovdqa %ymm11, 11*32+ymm_save
#endif
movq num_blocks, _num_blocks
// set up bswap parameters in the aligned stack space and pointer to table K512[]
lea CC_C_LABEL(sha512_K)(%rip), K
// load W[0:15] into xmm0-xmm7
vmovdqu 0*16(data), W0
vmovdqu 1*16(data), W1
vmovdqu 2*16(data), W2
vmovdqu 3*16(data), W3
vmovdqu 4*16(data), W4
vmovdqu 5*16(data), W5
vmovdqu 6*16(data), W6
vmovdqu 7*16(data), W7
addq $128, data
vmovdqa L_aligned_bswap, %xmm8
vpshufb %xmm8, W0, W0
vpshufb %xmm8, W1, W1
vpshufb %xmm8, W2, W2
vpshufb %xmm8, W3, W3
vpshufb %xmm8, W4, W4
vpshufb %xmm8, W5, W5
vpshufb %xmm8, W6, W6
vpshufb %xmm8, W7, W7
// compute WK[0:15] and save in stack
vpaddq 0*16(K), %xmm0, %xmm8
vpaddq 1*16(K), %xmm1, %xmm9
vpaddq 2*16(K), %xmm2, %xmm10
vpaddq 3*16(K), %xmm3, %xmm11
vmovdqa %xmm8, WK(0)
vmovdqa %xmm9, WK(2)
vmovdqa %xmm10, WK(4)
vmovdqa %xmm11, WK(6)
vpaddq 4*16(K), %xmm4, %xmm8
vpaddq 5*16(K), %xmm5, %xmm9
vpaddq 6*16(K), %xmm6, %xmm10
vpaddq 7*16(K), %xmm7, %xmm11
vmovdqa %xmm8, WK(8)
vmovdqa %xmm9, WK(10)
vmovdqa %xmm10, WK(12)
vmovdqa %xmm11, WK(14)
addq $128, K
L_loop:
// digests a-h = ctx->states;
mov 0*8(ctx), a
mov 1*8(ctx), b
mov 2*8(ctx), c
mov 3*8(ctx), d
mov 4*8(ctx), e
mov 5*8(ctx), f
mov 6*8(ctx), g
mov 7*8(ctx), h
// rounds 0:63 interleaved with W/WK update for rounds 16:79
rounds_schedule a, b, c, d, e, f, g, h, 16
rounds_schedule a, b, c, d, e, f, g, h, 32
rounds_schedule a, b, c, d, e, f, g, h, 48
rounds_schedule a, b, c, d, e, f, g, h, 64
// revert K to the beginning of K512[]
subq $640, K
subq $1, _num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
jmp L_loop // branch for next block
// wrap up digest update rounds 64:79 for the final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
// if kernel, restore ymm0-ymm11
#if BUILDKERNEL
vmovdqa 0*32+ymm_save, %ymm0
vmovdqa 1*32+ymm_save, %ymm1
vmovdqa 2*32+ymm_save, %ymm2
vmovdqa 3*32+ymm_save, %ymm3
vmovdqa 4*32+ymm_save, %ymm4
vmovdqa 5*32+ymm_save, %ymm5
vmovdqa 6*32+ymm_save, %ymm6
vmovdqa 7*32+ymm_save, %ymm7
vmovdqa 8*32+ymm_save, %ymm8
vmovdqa 9*32+ymm_save, %ymm9
vmovdqa 10*32+ymm_save, %ymm10
vmovdqa 11*32+ymm_save, %ymm11
#endif
// free allocated stack memory
leaq -40(%rbp), sp
// restore callee-saved registers
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 4
L_bswap:
.quad 0x0001020304050607
.quad 0x08090a0b0c0d0e0f
#endif // x86_64

View File

@ -0,0 +1,552 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides x86_64 avx2 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in ymm0-ymm3 (xmm0-xmm7 in the ssse3/avx1 version, zmm0-zmm1 for avx512)
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
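/*
 For reference, a minimal scalar C sketch of the per-round update described in
 steps 3 and 4 above (illustrative only; rotr64 and sha512_round are not part of
 corecrypto -- the macros below implement the same math with the digests kept in
 GPRs and W/WK kept in vector registers and the stack):

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

    static void sha512_round(uint64_t s[8], uint64_t wk)   // wk = K[r] + W[r]
    {
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint64_t Sigma1 = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
        uint64_t Ch     = (e & f) ^ (~e & g);
        uint64_t T1     = h + Sigma1 + Ch + wk;
        uint64_t Sigma0 = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
        uint64_t Maj    = (a & b) ^ (a & c) ^ (b & c);
        // permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + Sigma0 + Maj;
    }
*/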
#if defined __x86_64__
// associate variables with registers or memory
#define sp %rsp
#define ctx %rdi
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
#define data %rdx
#define a %r8
#define b %r9
#define c %r10
#define d %r11
#define e %r12
#define f %r13
#define g %r14
#define h %r15
#define K %rbx
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define L_aligned_bswap L_bswap(%rip)
#define stack_size (8+32*8+128) // 8 (_num_blocks) + ymm save/restore + WK(0:15)
#define ymm_save 128(sp) // starting address for ymm save/restore
// 3 local variables
#define s %rax
#define t %rcx
#define u %rsi
// a window (16 quad-words) of the message schedule
#define W0 %ymm0
#define W1 %ymm1
#define W2 %ymm2
#define W3 %ymm3
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*8(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch arg0, arg1, arg2
#if 1
mov \arg2, t
xor \arg1, t
and \arg0, t
xor \arg2, t
#else
mov \arg0, t // x
mov \arg0, s // x
not t // ~x
and \arg1, s // x & y
and \arg2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
#endif
.endm
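/*
 Note on the enabled (#if 1) branch above: it uses the identity
     Ch(x,y,z) = ((y ^ z) & x) ^ z
 (each bit selects y when x=1 and z when x=0, i.e. (x&y) ^ (~x&z)),
 which needs only the single temporary t instead of two.
*/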
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj arg0, arg1, arg2
mov \arg1, t // y
mov \arg2, s // z
xor \arg2, t // y^z
and \arg1, s // y&z
and \arg0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on 4 quad-words in a ymm register
// use ymm6/ymm7 as intermediate registers
.macro Gamma0 arg0
vpsrlq $1, \arg0, %ymm6 // part of S64(1, x)
vpsllq $56, \arg0, %ymm7 // part of S64(8, x)
vpsrlq $7, \arg0, \arg0 // R(7, x)
vpxor %ymm6, \arg0, \arg0
vpsrlq $7, %ymm6, %ymm6 // part of S64(8, x)
vpxor %ymm7, \arg0, \arg0
vpsllq $7, %ymm7, %ymm7 // part of S64(1, x)
vpxor %ymm6, \arg0, \arg0
vpxor %ymm7, \arg0, \arg0
.endm
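/*
 AVX/AVX2 has no 64-bit vector rotate, so each S64(n, x) above is assembled from
 a shift pair: S64(1,x) = (x>>1) | (x<<63) and S64(8,x) = (x>>8) | (x<<56).
 The macro reuses the x>>1 and x<<56 intermediates (shifting each by a further
 7 bits) to obtain x>>8 and x<<63, saving two instructions; Gamma1 below applies
 the same trick to the 19/61/6 shift amounts.
*/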
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on 4 quad-words in a ymm register
// use ymm6/ymm7 as intermediate registers
.macro Gamma1 arg0
vpsrlq $19, \arg0, %ymm6 // part of S64(19, x)
vpsllq $3, \arg0, %ymm7 // part of S64(61, x)
vpsrlq $6, \arg0, \arg0 // R(6, x)
vpxor %ymm6, \arg0, \arg0
vpsrlq $42, %ymm6, %ymm6 // part of S64(61, x)
vpxor %ymm7, \arg0, \arg0
vpsllq $42, %ymm7, %ymm7 // part of S64(19, x)
vpxor %ymm6, \arg0, \arg0
vpxor %ymm7, \arg0, \arg0
.endm
.macro rightshift16 arg0, arg1
vpxor \arg1, \arg1, \arg1
vperm2f128 $33, \arg1, \arg0, \arg1
.endm
.macro leftshift16 arg0, arg1
vpxor \arg1, \arg1, \arg1
vperm2f128 $2, \arg1, \arg0, \arg1
.endm
.macro vpalignr8 arg0, arg1, arg2
vpblendd $3, \arg1, \arg0, \arg2
vpermq $57, \arg2, \arg2
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3
update 4 quad words in W0 += vext(W2,W3,#8) + Gamma0(vext(W0,W1, #8)) + Gamma1(W1<<16);
W0 += Gamma1(vext(W3,W0, #16)).
*/
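/*
 Illustrative note: because four schedule words W[r..r+3] are produced per
 invocation, the Gamma1(W[r-2]) term of the upper two lanes depends on W[r] and
 W[r+1], which are computed in the same invocation.  The macro therefore applies
 Gamma1 in two halves: first to [W[r-2] W[r-1] 0 0] (rightshift16 of \arg3), and
 once the low lanes are complete to [0 0 W[r] W[r+1]] (leftshift16 of \arg0).
 The scalar recurrence being vectorized is simply

    W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
    // Gamma0(x) = rotr64(x,1)  ^ rotr64(x,8)  ^ (x >> 7)
    // Gamma1(x) = rotr64(x,19) ^ rotr64(x,61) ^ (x >> 6)
    // with rotr64(x,n) = (x >> n) | (x << (64 - n))
*/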
.macro message_update4 arg0, arg1, arg2, arg3
vpblendd $3, \arg1, \arg0, %ymm5
vpxor %ymm4, %ymm4, %ymm4
vpermq $57, %ymm5, %ymm5 // ymm5 = W[r-15] = vpalignr8 \arg0, \arg1, %ymm5
vperm2f128 $33, %ymm4, \arg3, %ymm4 // ymm4 = [W[16] W[17] 0 0] half of W[r-2] = rightshift16 \arg3, %ymm4
Gamma0 %ymm5 // Gamma0(W[r-15])
Gamma1 %ymm4 // Gamma1(W[r-2]) half
vpaddq %ymm5, \arg0, \arg0 // W0 += Gamma0([r-15]);
vpblendd $3, \arg3, \arg2, %ymm5
vpaddq %ymm4, \arg0, \arg0 // W0 += Gamma1(W[r-2]) + Gamma0(vext(W0,W1, #8));
vpermq $57, %ymm5, %ymm5 // W[r-7] = vpalignr8 \arg2, \arg3, %ymm5 // W[r-7]
vpxor %ymm4, %ymm4, %ymm4
vpaddq %ymm5, \arg0, \arg0 // W0 += W[r-7]
vperm2f128 $2, %ymm4, \arg0, %ymm4 // leftshift16 \arg0, %ymm4 for W0<<16
Gamma1 %ymm4 // Gamma1(W0<<16)
vpaddq %ymm4, \arg0, \arg0 // W0 += Gamma1(W0<<16);
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0 arg0
rorx $28, \arg0, s // S(28, (x))
rorx $34, \arg0, t // S(34, (x))
rorx $11, s, u // S(39, (x))
xor s, t // S(28, (x)) ^ S(34, (x))
xor u, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm
// #define Sigma1(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x)))
.macro Sigma1 arg0
rorx $14, \arg0, s // S(14, (x))
rorx $18, \arg0, t // S(18, (x))
rorx $27, s, u // S(41, (x))
xor s, t // S(14, (x)) ^ S(18, (x))
xor u, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm
// per round digests update
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
Sigma1 \arg4 // t = T1
add t, \arg7 // use h to store h+Sigma1(e)
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
add \arg7, t // t = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), t // h = T1
add t, \arg3 // d += T1;
mov t, \arg7 // h = T1
Sigma0 \arg0 // t = Sigma0(a);
add t, \arg7 // h = T1 + Sigma0(a);
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
rorx $14, \arg4, s // S(14, (x))
mov \arg6, t // Ch(e,f,g) : 1
rorx $18, \arg4, u // S(18, (x))
xor \arg5, t // Ch(e,f,g) : 2
xor s, u // S(14, (x)) ^ S(18, (x))
and \arg4, t // Ch(e,f,g) : 3
rorx $27, s, s // S(41, (x))
xor \arg6, t // t = Ch(e,f,g);
xor s, u // u = Sigma1(e);
add t, \arg7 // h = h+Ch(e,f,g);
add u, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), \arg7 // h = T1
add \arg7, \arg3 // d += T1;
rorx $28, \arg0, s // S(28, (x))
rorx $34, \arg0, u // S(34, (x))
xor s, u // S(28, (x)) ^ S(34, (x))
rorx $11, s, s // S(39, (x))
xor s, u // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
add u, \arg7 // h = T1 + Sigma0(a);
mov \arg1, t // b
mov \arg2, s // c
xor \arg2, t // b^c
and \arg1, s // b&c
and \arg0, t // a&(b^c)
xor s, t // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
/*
16 rounds of hash update, update input schedule W (in vector register ymm0-ymm3) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
message_update4 W0, W1, W2, W3
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vpaddq 0*32(K), W0, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %ymm4, WK(0)
message_update4 W1, W2, W3, W0
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vpaddq 1*32(K), W1, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %ymm4, WK(4)
message_update4 W2, W3, W0, W1
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vpaddq 2*32(K), W2, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %ymm4, WK(8)
message_update4 W3, W0, W1, W2
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vpaddq 3*32(K), W3, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %ymm4, WK(12)
addq $128, K
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vmovdqu 0*32(data), W0
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
vpshufb L_aligned_bswap, W0, W0
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vpaddq 0*32(K), W0, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %ymm4, WK(0)
vmovdqu 1*32(data), W1
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
vpshufb L_aligned_bswap, W1, W1
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vpaddq 1*32(K), W1, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %ymm4, WK(4)
vmovdqu 2*32(data), W2
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
vpshufb L_aligned_bswap, W2, W2
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vpaddq 2*32(K), W2, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %ymm4, WK(8)
vmovdqu 3*32(data), W3
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
vpshufb L_aligned_bswap, W3, W3
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vpaddq 3*32(K), W3, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %ymm4, WK(12)
addq $128, K
addq $128, data
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
.text
.globl _AccelerateCrypto_SHA512_compress_AVX2
_AccelerateCrypto_SHA512_compress_AVX2:
// push callee-saved registers
push %rbp
movq %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack space
sub $stack_size, sp
andq $-32, sp // aligned sp to 32-bytes
// if kernel code, save used vector registers (ymm0-ymm7)
#if BUILDKERNEL
vmovdqa %ymm0, 0*32+ymm_save
vmovdqa %ymm1, 1*32+ymm_save
vmovdqa %ymm2, 2*32+ymm_save
vmovdqa %ymm3, 3*32+ymm_save
vmovdqa %ymm4, 4*32+ymm_save
vmovdqa %ymm5, 5*32+ymm_save
vmovdqa %ymm6, 6*32+ymm_save
vmovdqa %ymm7, 7*32+ymm_save
#endif
movq num_blocks, _num_blocks
// set up bswap parameters in the aligned stack space and pointer to table K512[]
lea CC_C_LABEL(sha512_K)(%rip), K
// load W[0:15] into ymm0-ymm3
vmovdqu 0*32(data), W0
vmovdqu 1*32(data), W1
vmovdqu 2*32(data), W2
vmovdqu 3*32(data), W3
addq $128, data
vmovdqa L_aligned_bswap, %ymm4
vpshufb %ymm4, W0, W0
vpshufb %ymm4, W1, W1
vpshufb %ymm4, W2, W2
vpshufb %ymm4, W3, W3
// compute WK[0:15] and save in stack
vpaddq 0*32(K), W0, %ymm4
vpaddq 1*32(K), W1, %ymm5
vpaddq 2*32(K), W2, %ymm6
vpaddq 3*32(K), W3, %ymm7
addq $128, K
vmovdqa %ymm4, WK(0)
vmovdqa %ymm5, WK(4)
vmovdqa %ymm6, WK(8)
vmovdqa %ymm7, WK(12)
L_loop:
// digests a-h = ctx->states;
mov 0*8(ctx), a
mov 1*8(ctx), b
mov 2*8(ctx), c
mov 3*8(ctx), d
mov 4*8(ctx), e
mov 5*8(ctx), f
mov 6*8(ctx), g
mov 7*8(ctx), h
// rounds 0:63 interleaved with W/WK update for rounds 16:79
rounds_schedule a, b, c, d, e, f, g, h, 16
rounds_schedule a, b, c, d, e, f, g, h, 32
rounds_schedule a, b, c, d, e, f, g, h, 48
rounds_schedule a, b, c, d, e, f, g, h, 64
// revert K to the beginning of K512[]
subq $640, K
subq $1, _num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
jmp L_loop // branch for next block
// wrap up digest update rounds 64:79 for the final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
// if kernel, restore ymm0-ymm7
#if BUILDKERNEL
vmovdqa 0*32+ymm_save, %ymm0
vmovdqa 1*32+ymm_save, %ymm1
vmovdqa 2*32+ymm_save, %ymm2
vmovdqa 3*32+ymm_save, %ymm3
vmovdqa 4*32+ymm_save, %ymm4
vmovdqa 5*32+ymm_save, %ymm5
vmovdqa 6*32+ymm_save, %ymm6
vmovdqa 7*32+ymm_save, %ymm7
#endif
// free allocated stack memory
leaq -40(%rbp), sp
// restore callee-saved registers
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 5
L_bswap:
.quad 0x0001020304050607
.quad 0x08090a0b0c0d0e0f
.quad 0x1011121314151617
.quad 0x18191a1b1c1d1e1f
#endif // x86_64

View File

@ -0,0 +1,619 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides x86_64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx2/avx512)
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __x86_64__
// associate variables with registers or memory
#define sp %rsp
#define ctx %rdi
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
#define data %rdx
#define a %r8
#define b %r9
#define c %r10
#define d %r11
#define e %r12
#define f %r13
#define g %r14
#define h %r15
#define K %rbx
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define stack_size (8+16*12+128) // 8 (_num_blocks) + xmm0:xmm11 + WK(0:15)
#define L_aligned_bswap L_bswap(%rip) // bswap : big-endian loading of 8-byte words
#define xmm_save 128(sp) // starting address for xmm save/restore
// 3 local variables
#define s %rax
#define t %rcx
#define u %rsi
// a window (16 quad-words) of the message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
#define W4 %xmm4
#define W5 %xmm5
#define W6 %xmm6
#define W7 %xmm7
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*8(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch arg0, arg1, arg2
#if 1
mov \arg2, t
xor \arg1, t
and \arg0, t
xor \arg2, t
#else
mov \arg0, t // x
mov \arg0, s // x
not t // ~x
and \arg1, s // x & y
and \arg2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
#endif
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj arg0, arg1, arg2
mov \arg1, t // y
mov \arg2, s // z
xor \arg2, t // y^z
and \arg1, s // y&z
and \arg0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on 2 quad-words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma0 arg0
movdqa \arg0, %xmm8
movdqa \arg0, %xmm9
psrlq $7, \arg0 // R(7, x)
psrlq $1, %xmm8 // part of S64(1, x)
psllq $56, %xmm9 // part of S64(8, x)
pxor %xmm8, \arg0
psrlq $7, %xmm8 // part of S64(8, x)
pxor %xmm9, \arg0
psllq $7, %xmm9 // part of S64(1, x)
pxor %xmm8, \arg0
pxor %xmm9, \arg0
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on 2 quad-words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma1 arg0
movdqa \arg0, %xmm8
movdqa \arg0, %xmm9
psrlq $6, \arg0 // R(6, x)
psrlq $19, %xmm8 // part of S64(19, x)
psllq $3, %xmm9 // part of S64(61, x)
pxor %xmm8, \arg0
psrlq $42, %xmm8 // part of S64(61, x)
pxor %xmm9, \arg0
psllq $42, %xmm9 // part of S64(19, x)
pxor %xmm8, \arg0
pxor %xmm9, \arg0
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use %xmm10, %xmm11 for temp
*/
.macro message_update2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
movdqa \arg5, %xmm10
movdqa \arg1, %xmm11
palignr $8, \arg4, %xmm10 // vext(W4,W5)
palignr $8, \arg0, %xmm11 // vext(W0,W1)
paddq %xmm10, \arg0 // W0 + vext(W4,W5)
movdqa \arg7, %xmm10
Gamma1 %xmm10 // Gamma1(W7)
Gamma0 %xmm11 // Gamma0(vext(W0,W1))
paddq %xmm10, \arg0 // W0 + Gamma1(W7) + vext(W4,W5)
paddq %xmm11, \arg0 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0 arg0
mov \arg0, t // x
mov \arg0, s // x
ror $28, t // S(28, (x))
ror $34, s // S(34, (x))
xor s, t // S(28, (x)) ^ S(34, (x))
ror $5, s // S(39, (x))
xor s, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm
// #define Sigma1(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x)))
.macro Sigma1 arg0
mov \arg0, s // x
ror $14, s // S(14, (x))
mov s, t // S(14, (x))
ror $4, s // S(18, (x))
xor s, t // S(14, (x)) ^ S(18, (x))
ror $23, s // S(41, (x))
xor s, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm
// per round digests update
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
Sigma1 \arg4 // t = Sigma1(e);
add t, \arg7 // h = h+Sigma1(e)
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
add t, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), \arg7 // h = h+Sigma1(e)+Ch(e,f,g)+WK
add \arg7, \arg3 // d += h;
Sigma0 \arg0 // t = Sigma0(a);
add t, \arg7 // h += Sigma0(a);
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
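/*
 The round macro below is the instruction-scheduled version of round_ref.  It
 builds the three-rotation sums with the identity ror(x ^ y, n) = ror(x, n) ^ ror(y, n):
     Sigma1(e) = ror(ror(ror(e, 41-18) ^ e, 18-14) ^ e, 14) = ror(e,41) ^ ror(e,18) ^ ror(e,14)
     Sigma0(a) = ror(ror(ror(a, 39-34) ^ a, 34-28) ^ a, 28) = ror(a,39) ^ ror(a,34) ^ ror(a,28)
 which is why the rotate counts appear as differences of the SHA-512 constants.
*/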
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
mov \arg4, s
mov \arg0, t
ror $(41-18), s
ror $(39-34), t
xor \arg4, s
mov \arg5, u
xor \arg0, t
ror $(18-14), s
xor \arg6, u
xor \arg4, s
ror $(34-28), t
and \arg4, u
xor \arg0, t
xor \arg6, u
ror $14, s
ror $28, t
add s, u
mov \arg0, s
add WK(\arg8), u
or \arg2, s
add u, \arg7
mov \arg0, u
add \arg7, \arg3
and \arg1, s
and \arg2, u
or u, s
add t, \arg7
add s, \arg7
.endm
/*
16 rounds of hash update, update input schedule W (in vector register xmm0-xmm7) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
movdqa 0*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
paddq W0, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
movdqa %xmm8, WK(0)
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
movdqa 1*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
paddq W1, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
movdqa %xmm8, WK(2)
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
movdqa 2*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
paddq W2, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
movdqa %xmm8, WK(4)
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
movdqa 3*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
paddq W3, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
movdqa %xmm8, WK(6)
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
movdqa 4*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
paddq W4, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
movdqa %xmm8, WK(8)
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
movdqa 5*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
paddq W5, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
movdqa %xmm8, WK(10)
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
movdqa 6*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
paddq W6, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
movdqa %xmm8, WK(12)
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
movdqa 7*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
paddq W7, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
movdqa %xmm8, WK(14)
addq $128, K
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
movdqu 0*16(data), W0
movdqa 0*16(K), %xmm8
pshufb L_aligned_bswap, W0
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
paddq W0, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
movdqa %xmm8, WK(0)
movdqu 1*16(data), W1
movdqa 1*16(K), %xmm8
pshufb L_aligned_bswap, W1
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
paddq W1, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
movdqa %xmm8, WK(2)
movdqu 2*16(data), W2
movdqa 2*16(K), %xmm8
pshufb L_aligned_bswap, W2
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
paddq W2, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
movdqa %xmm8, WK(4)
movdqu 3*16(data), W3
movdqa 3*16(K), %xmm8
pshufb L_aligned_bswap, W3
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
paddq W3, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
movdqa %xmm8, WK(6)
movdqu 4*16(data), W4
movdqa 4*16(K), %xmm8
pshufb L_aligned_bswap, W4
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
paddq W4, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
movdqa %xmm8, WK(8)
movdqu 5*16(data), W5
movdqa 5*16(K), %xmm8
pshufb L_aligned_bswap, W5
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
paddq W5, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
movdqa %xmm8, WK(10)
movdqu 6*16(data), W6
movdqa 6*16(K), %xmm8
pshufb L_aligned_bswap, W6
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
paddq W6, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
movdqa %xmm8, WK(12)
movdqu 7*16(data), W7
movdqa 7*16(K), %xmm8
pshufb L_aligned_bswap, W7
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
paddq W7, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
movdqa %xmm8, WK(14)
addq $128, K
addq $128, data
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
.text
.globl _AccelerateCrypto_SHA512_compress_ssse3
_AccelerateCrypto_SHA512_compress_ssse3:
// push callee-saved registers
push %rbp
movq %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack space
sub $stack_size, sp
// if kernel code, save used xmm registers
#if BUILDKERNEL
movdqa %xmm0, 0*16+xmm_save
movdqa %xmm1, 1*16+xmm_save
movdqa %xmm2, 2*16+xmm_save
movdqa %xmm3, 3*16+xmm_save
movdqa %xmm4, 4*16+xmm_save
movdqa %xmm5, 5*16+xmm_save
movdqa %xmm6, 6*16+xmm_save
movdqa %xmm7, 7*16+xmm_save
movdqa %xmm8, 8*16+xmm_save
movdqa %xmm9, 9*16+xmm_save
movdqa %xmm10, 10*16+xmm_save
movdqa %xmm11, 11*16+xmm_save
#endif
movq num_blocks, _num_blocks
// set up bswap parameters in the aligned stack space and pointer to table K512[]
lea CC_C_LABEL(sha512_K)(%rip), K
// load W[0:15] into xmm0-xmm7
movdqu 0*16(data), W0
movdqu 1*16(data), W1
movdqu 2*16(data), W2
movdqu 3*16(data), W3
movdqu 4*16(data), W4
movdqu 5*16(data), W5
movdqu 6*16(data), W6
movdqu 7*16(data), W7
addq $128, data
movdqa L_aligned_bswap, %xmm8
pshufb %xmm8, W0
pshufb %xmm8, W1
pshufb %xmm8, W2
pshufb %xmm8, W3
pshufb %xmm8, W4
pshufb %xmm8, W5
pshufb %xmm8, W6
pshufb %xmm8, W7
// compute WK[0:15] and save in stack
movdqa 0*16(K), %xmm8
movdqa 1*16(K), %xmm9
movdqa 2*16(K), %xmm10
movdqa 3*16(K), %xmm11
paddq %xmm0, %xmm8
paddq %xmm1, %xmm9
paddq %xmm2, %xmm10
paddq %xmm3, %xmm11
movdqa %xmm8, WK(0)
movdqa %xmm9, WK(2)
movdqa %xmm10, WK(4)
movdqa %xmm11, WK(6)
movdqa 4*16(K), %xmm8
movdqa 5*16(K), %xmm9
movdqa 6*16(K), %xmm10
movdqa 7*16(K), %xmm11
paddq %xmm4, %xmm8
paddq %xmm5, %xmm9
paddq %xmm6, %xmm10
paddq %xmm7, %xmm11
movdqa %xmm8, WK(8)
movdqa %xmm9, WK(10)
movdqa %xmm10, WK(12)
movdqa %xmm11, WK(14)
addq $128, K
L_loop:
// digests a-h = ctx->states;
mov 0*8(ctx), a
mov 1*8(ctx), b
mov 2*8(ctx), c
mov 3*8(ctx), d
mov 4*8(ctx), e
mov 5*8(ctx), f
mov 6*8(ctx), g
mov 7*8(ctx), h
// rounds 0:63 interleaved with W/WK update for rounds 16:79
rounds_schedule a, b, c, d, e, f, g, h, 16
rounds_schedule a, b, c, d, e, f, g, h, 32
rounds_schedule a, b, c, d, e, f, g, h, 48
rounds_schedule a, b, c, d, e, f, g, h, 64
// revert K to the beginning of K512[]
subq $640, K
subq $1, _num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
jmp L_loop // branch for next block
// wrap up digest update rounds 64:79 for the final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
// if kernel, restore xmm0-xmm11
#if BUILDKERNEL
movdqa 0*16+xmm_save, %xmm0
movdqa 1*16+xmm_save, %xmm1
movdqa 2*16+xmm_save, %xmm2
movdqa 3*16+xmm_save, %xmm3
movdqa 4*16+xmm_save, %xmm4
movdqa 5*16+xmm_save, %xmm5
movdqa 6*16+xmm_save, %xmm6
movdqa 7*16+xmm_save, %xmm7
movdqa 8*16+xmm_save, %xmm8
movdqa 9*16+xmm_save, %xmm9
movdqa 10*16+xmm_save, %xmm10
movdqa 11*16+xmm_save, %xmm11
#endif
// free allocated stack memory
add $stack_size, sp
// restore callee-saved registers
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 4
L_bswap:
.quad 0x0001020304050607
.quad 0x08090a0b0c0d0e0f
#endif // x86_64

View File

@ -0,0 +1,58 @@
/* Copyright (c) (2016,2018-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stdint.h>
#include <corecrypto/cc_config.h>
/* the K array */
const uint64_t sha512_K[80] CC_ALIGNED(16) = {
0x428a2f98d728ae22, 0x7137449123ef65cd,
0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
0x3956c25bf348b538, 0x59f111f1b605d019,
0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
0xd807aa98a3030242, 0x12835b0145706fbe,
0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
0x9bdc06a725c71235, 0xc19bf174cf692694,
0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
0x983e5152ee66dfab, 0xa831c66d2db43210,
0xb00327c898fb213f, 0xbf597fc7beef0ee4,
0xc6e00bf33da88fc2, 0xd5a79147930aa725,
0x06ca6351e003826f, 0x142929670a0e6e70,
0x27b70a8546d22ffc, 0x2e1b21385c26c926,
0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
0x650a73548baf63de, 0x766a0abb3c77b2a8,
0x81c2c92e47edaee6, 0x92722c851482353b,
0xa2bfe8a14cf10364, 0xa81a664bbc423001,
0xc24b8b70d0f89791, 0xc76c51a30654be30,
0xd192e819d6ef5218, 0xd69906245565a910,
0xf40e35855771202a, 0x106aa07032bbd1b8,
0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
0x748f82ee5defb2fc, 0x78a5636f43172f60,
0x84c87814a1f0ab72, 0x8cc702081a6439ec,
0x90befffa23631e28, 0xa4506cebde82bde9,
0xbef9a3f7b2c67915, 0xc67178f2e372532b,
0xca273eceea26619c, 0xd186b8c721c0c207,
0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
0x06f067aa72176fba, 0x0a637dc5a2c898a6,
0x113f9804bef90dae, 0x1b710b35131c471b,
0x28db77f523047d84, 0x32caab7b40c72493,
0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
0x5fcb6fab3ad6faec, 0x6c44198c4a475817
};

182
cc/corecrypto/cc.h Normal file
View File

@ -0,0 +1,182 @@
/* Copyright (c) (2010-2012,2014-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_H_
#define _CORECRYPTO_CC_H_
#include <corecrypto/cc_config.h>
#include <corecrypto/cc_error.h>
#include <string.h>
#include <stdint.h>
#if __has_feature(attribute_availability_with_replacement)
#if __has_feature(attribute_availability_bridgeos)
#ifndef __CC_BRIDGE_OS_DEPRECATED
#define __CC_BRIDGEOS_DEPRECATED(_dep, _msg) __attribute__((availability(bridgeos,deprecated=_dep, replacement=_msg)))
#endif
#endif
#ifndef __CC_BRIDGEOS_DEPRECATED
#define __CC_BRIDGEOS_DEPRECATED(_dep, _msg)
#endif
#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version) \
__attribute__((availability(macos,deprecated=macos_version, replacement=replacement_message)))\
__attribute__((availability(ios,deprecated=ios_version, replacement=replacement_message)))\
__attribute__((availability(watchos,deprecated=watchos_version, replacement=replacement_message)))\
__attribute__((availability(tvos,deprecated=tvos_version, replacement=replacement_message)))\
__CC_BRIDGEOS_DEPRECATED(bridgeos_version, replacement_message)
#else /* !__has_feature(attribute_availability_with_replacement) */
#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version)
#endif /* __has_feature(attribute_availability_with_replacement) */
/* Provide a general purpose macro concat method. */
#define cc_concat_(a, b) a##b
#define cc_concat(a, b) cc_concat_(a, b)
#if defined(_MSC_VER)
#define __asm__(x)
#endif
/* Manage asserts here because a few functions in header public files do use asserts */
#if CORECRYPTO_DEBUG
#define cc_assert(x) assert(x)
#else
#define cc_assert(x)
#endif
#if CC_KERNEL
#include <kern/assert.h>
#elif CC_USE_S3
#define assert(args) // No assert in S3
#else
#include <assert.h>
#endif
/* Provide a static assert that can be used to create compile-type failures. */
#define cc_static_assert(e,m) \
enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) }
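/* Usage sketch (illustrative, not part of the original header):
       cc_static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes");
   When the condition is false the enum initializer divides by zero, which is
   rejected at compile time; the message argument only serves as documentation. */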
/* Declare a struct element with a guaranteed alignment of _alignment_.
The resulting struct can be used to create arrays that are aligned by
a certain amount. */
#define cc_aligned_struct(_alignment_) \
typedef struct { \
uint8_t b[_alignment_]; \
} CC_ALIGNED(_alignment_)
#if defined(__BIGGEST_ALIGNMENT__)
#define CC_MAX_ALIGNMENT ((size_t)__BIGGEST_ALIGNMENT__)
#else
#define CC_MAX_ALIGNMENT ((size_t)16)
#endif
/* pads a given size to be a multiple of the biggest alignment for any type */
#define cc_pad_align(_size_) ((_size_ + CC_MAX_ALIGNMENT - 1) & (~(CC_MAX_ALIGNMENT - 1)))
/* number of array elements used in a cc_ctx_decl */
#define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_))
/* sizeof of a context declared with cc_ctx_decl */
#define cc_ctx_sizeof(_type_, _size_) sizeof(_type_[cc_ctx_n(_type_, _size_)])
/*
1. _alloca cannot be removed because this header file is compiled with both MSVC++ and with clang.
2. The _MSC_VER version of cc_ctx_decl() is not compatible with the way the *_decl macros are used in CommonCrypto, AppleKeyStore and SecurityFrameworks. To observe the incompatibilities and errors, use the definition below. Corecrypto itself accepts both definitions:
#define cc_ctx_decl(_type_, _size_, _name_) _type_ _name_ ## _array[cc_ctx_n(_type_, (_size_))]; _type_ *_name_ = _name_ ## _array
3. Never use the sizeof() operator on variables declared with cc_ctx_decl(), because it is not compatible with the _MSC_VER version of cc_ctx_decl().
*/
#if defined(_MSC_VER)
#include <malloc.h>
#define cc_ctx_decl(_type_, _size_, _name_) _type_ * _name_ = (_type_ *) _alloca(sizeof(_type_) * cc_ctx_n(_type_, _size_) )
#define cc_ctx_decl_field(_type_, _size_, _name_) _type_ _name_ [cc_ctx_n(_type_, _size_)]
#else
#define cc_ctx_decl(_type_, _size_, _name_) \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wvla\"") \
_type_ _name_ [cc_ctx_n(_type_, _size_)] \
_Pragma("GCC diagnostic pop")
#define cc_ctx_decl_field cc_ctx_decl
#endif
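/* Usage sketch (illustrative): to reserve _size_ = 100 bytes of uint64_t-sized storage,
       cc_ctx_decl(uint64_t, 100, ctx);
   declares an array of cc_ctx_n(uint64_t, 100) = (100 + 8 - 1) / 8 = 13 elements
   (104 bytes).  Per note 3 above, query the size with cc_ctx_sizeof() or the
   original _size_ rather than sizeof(ctx), so the code also works with the
   _MSC_VER definition. */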
/*!
@brief cc_clear(len, dst) zeroizes array dst and it will not be optimized out.
@discussion It is used to clear sensitive data, particularly when the data is defined on the stack
@param len number of bytes to be cleared in dst
@param dst the array to be cleared
*/
CC_NONNULL((2))
void cc_clear(size_t len, void *dst);
// cc_zero is deprecated, please use cc_clear instead.
cc_deprecate_with_replacement("cc_clear", 13.0, 10.15, 13.0, 6.0, 4.0)
CC_NONNULL_ALL CC_INLINE
void cc_zero(size_t len, void *dst)
{
cc_clear(len, dst);
}
#define cc_copy(_size_, _dst_, _src_) memcpy(_dst_, _src_, _size_)
CC_INLINE CC_NONNULL((2, 3, 4))
void cc_xor(size_t size, void *r, const void *s, const void *t) {
uint8_t *_r=(uint8_t *)r;
const uint8_t *_s=(const uint8_t *)s;
const uint8_t *_t=(const uint8_t *)t;
while (size--) {
_r[size] = _s[size] ^ _t[size];
}
}
/*!
@brief cc_cmp_safe(num, ptr1, ptr2) compares two arrays ptr1 and ptr2 of num bytes.
@discussion The execution time is independent of the contents of the data and therefore leaks nothing about them. However, the execution time does depend on num.
@param num number of bytes in each array
@param ptr1 input array
@param ptr2 input array
@return returns 0 if the num bytes starting at ptr1 are identical to the num bytes starting at ptr2 and 1 if they are different or if num is 0 (empty arrays).
*/
CC_NONNULL((2, 3))
int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2);
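// Usage sketch (illustrative; tag and expected are hypothetical 16-byte buffers):
//     if (cc_cmp_safe(16, tag, expected) != 0) {
//         // reject: buffers differ (or num was 0)
//     }
// Unlike memcmp(), the execution time does not reveal the position of the first mismatch.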
/* Exchange S and T of any type. NOTE: Both S and T are evaluated
multiple times and MUST NOT be expressions. */
#define CC_SWAP(S,T) do { \
volatile __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \
_cc_swap_tmp = 0;\
} while(0)
/* Return the maximum value between S and T. */
#define CC_MAX(S, T) ({__typeof__(S) _cc_max_s = S; __typeof__(T) _cc_max_t = T; _cc_max_s > _cc_max_t ? _cc_max_s : _cc_max_t;})
/* Clone of CC_MAX() that evaluates S and T multiple times to allow nesting. */
#define CC_MAX_EVAL(S, T) ((S) > (T) ? (S) : (T))
/* Return the minimum value between S and T. */
#define CC_MIN(S, T) ({__typeof__(S) _cc_min_s = S; __typeof__(T) _cc_min_t = T; _cc_min_s <= _cc_min_t ? _cc_min_s : _cc_min_t;})
/*
When building with "-nostdinc" (i.e. iboot), ptrauth.h is in a non-standard location.
This requires a new flag to be used when building iboot: -ibuiltininc which is not
yet available.
*/
#if __has_feature(ptrauth_calls) && (CC_KERNEL || CC_USE_L4 || CC_USE_SEPROM)
#include <ptrauth.h>
#define CC_SPTR(_sn_, _n_) \
__ptrauth(ptrauth_key_process_independent_code, 1, ptrauth_string_discriminator("cc_" #_sn_ #_n_)) _n_
#else
#define CC_SPTR(_sn_, _n_) _n_
#endif
#endif /* _CORECRYPTO_CC_H_ */

View File

@ -0,0 +1,83 @@
/* Copyright (c) (2016-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef cc_absolute_time_h
#define cc_absolute_time_h
#include <corecrypto/cc_config.h>
#include <stdint.h>
// For more info on mach_absolute_time() precision:
// https://developer.apple.com/library/mac/qa/qa1398/_index.html
#if CC_USE_L4
#include <ert/time.h>
#define cc_absolute_time() ert_time_now()
// L4 doesn't use a scaling factor
#define cc_absolute_time_sf() (1.0 / 1000000000.0)
#elif CC_KERNEL
#include <mach/mach_time.h>
#include <kern/clock.h>
#define cc_absolute_time() (mach_absolute_time())
// Scale factor to convert absolute time to seconds
#define cc_absolute_time_sf() ({ \
struct mach_timebase_info info; \
clock_timebase_info(&info); \
((double)info.numer) / (1000000000.0 * info.denom); \
})
#elif CC_DARWIN
#include <mach/mach_time.h>
#define cc_absolute_time() (mach_absolute_time())
// Scale factor to convert absolute time to seconds
#define cc_absolute_time_sf() ({ \
struct mach_timebase_info info; \
mach_timebase_info(&info); \
((double)info.numer) / (1000000000.0 * info.denom); \
})
#elif defined(_WIN32)
#include <windows.h>
CC_INLINE uint64_t cc_absolute_time(void) {
LARGE_INTEGER time;
QueryPerformanceCounter(&time); //resolution < 1us
return (uint64_t)time.QuadPart;
}
CC_INLINE double cc_absolute_time_sf(){
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq); //performance counter freq in Hz
return (double)1 / freq.QuadPart;
}
#elif CC_LINUX
#if CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#include <mach/mach_time.h>
#define cc_absolute_time() (mach_absolute_time()) // To test compilation on mac
#else
// The following is specific to non x86 (arm/mips/etc...) architectures on Linux.
#warning cc_absolute_time() has not been tested
#include <time.h>
#define NSEC_PER_USEC 1000ull
CC_INLINE uint64_t cc_absolute_time() {
struct timespec tm;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tm);
return tm.tv_sec * 1000000000ull + tm.tv_nsec;
}
#endif // CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#define cc_absolute_time_sf() (1.0 / 1000000000.0)
#else
#warning Target OS is not defined. There should be a definition for cc_absolute_time() for the target OS/platform.
#endif
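// Usage sketch (illustrative): measuring elapsed time in seconds with whichever
// primitives the platform branch above selected:
//     uint64_t t0 = cc_absolute_time();
//     /* ... work to be timed ... */
//     uint64_t t1 = cc_absolute_time();
//     double seconds = (double)(t1 - t0) * cc_absolute_time_sf();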
#endif /* cc_absolute_time_h */

600
cc/corecrypto/cc_config.h Normal file
View File

@ -0,0 +1,600 @@
/* Copyright (c) (2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_CONFIG_H_
#define _CORECRYPTO_CC_CONFIG_H_
/* A word about configuration macros:
Conditional configuration macros specific to corecrypto should be named CORECRYPTO_xxx
or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an
#ifndef #error construct at the end of this file to make sure it's always defined.
They should always be tested using the #if directive, never the #ifdef directive.
No other conditional macros shall ever be used (except in this file)
Configuration Macros that are defined outside of corecrypto (eg: KERNEL, DEBUG, ...)
shall only be used in this file to define CCxxx macros.
External macros should be assumed to be either undefined, defined with no value,
or defined as true or false. We shall strive to build with -Wundef whenever possible,
so the following construct should be used to test external macros in this file:
#if defined(DEBUG) && (DEBUG)
#define CORECRYPTO_DEBUG 1
#else
#define CORECRYPTO_DEBUG 0
#endif
It is acceptable to define a conditional CC_xxxx macro in an implementation file,
to be used only in this file.
The current code is not guaranteed to follow those rules, but should be fixed to.
Corecrypto requires GNU and C99 compatibility.
Typically enabled by passing --gnu --c99 to the compiler (eg. armcc)
*/
//Do not set this macro to 1, unless you are developing/testing for Linux under macOS
#define CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT 0
//Do not set these macros to 1, unless you are developing/testing for Windows under macOS
#define CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT 0
#define CORECRYPTO_HACK_FOR_WINDOWS_DEVELOPMENT 0
#if (defined(DEBUG) && (DEBUG)) || defined(_DEBUG) //MSVC defines _DEBUG
/* CC_DEBUG is already used in CommonCrypto */
#define CORECRYPTO_DEBUG 1
#else
#define CORECRYPTO_DEBUG 0
#endif
// This macro can be used to enable prints when a condition in the macro "cc_require"
// is false. This is especially useful to confirm that negative testing fails
// at the intended location
#define CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS 0
#if defined(KERNEL) && (KERNEL)
#define CC_KERNEL 1 // KEXT, XNU repo or kernel components such as AppleKeyStore
#else
#define CC_KERNEL 0
#endif
#if defined(__linux__) || CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#define CC_LINUX 1
#else
#define CC_LINUX 0
#endif
#if defined(__ANDROID__) && (__ANDROID__)
#define CC_ANDROID 1
#else
#define CC_ANDROID 0
#endif
#if defined(USE_L4) && (USE_L4)
#define CC_USE_L4 1
#else
#define CC_USE_L4 0
#endif
#if defined(RTKIT) && (RTKIT)
#define CC_RTKIT 1
#else
#define CC_RTKIT 0
#endif
#if defined(RTKITROM) && (RTKITROM)
#define CC_RTKITROM 1
#else
#define CC_RTKITROM 0
#endif
#if defined(USE_SEPROM) && (USE_SEPROM)
#define CC_USE_SEPROM 1
#else
#define CC_USE_SEPROM 0
#endif
#if defined(USE_S3) && (USE_S3)
#define CC_USE_S3 1
#else
#define CC_USE_S3 0
#endif
#if (defined(ICE_FEATURES_ENABLED)) || (defined(MAVERICK) && (MAVERICK))
#define CC_BASEBAND 1
#else
#define CC_BASEBAND 0
#endif
#if defined(EFI) && (EFI)
#define CC_EFI 1
#else
#define CC_EFI 0
#endif
#if defined(IBOOT) && (IBOOT)
#define CC_IBOOT 1
#else
#define CC_IBOOT 0
#endif
#if defined(TARGET_OS_BRIDGE)
#define CC_BRIDGE TARGET_OS_BRIDGE
#else
#define CC_BRIDGE 0
#endif
// Check if we're running on a generic, userspace platform, i.e., not in the kernel, SEP, etc.
#ifndef CC_GENERIC_PLATFORM
#define CC_GENERIC_PLATFORM \
(!CC_RTKIT && !CC_KERNEL && !CC_USE_L4 && \
!CC_RTKITROM && !CC_EFI && !CC_IBOOT && \
!CC_USE_SEPROM && !CC_ANDROID && !CC_LINUX && \
!CC_BRIDGE)
#endif
// Defined by the XNU build scripts
// Applies to code embedded in XNU but NOT to the kext
#if defined(XNU_KERNEL_PRIVATE)
#define CC_XNU_KERNEL_PRIVATE 1
#else
#define CC_XNU_KERNEL_PRIVATE 0
#endif
// handle unaligned data, if the cpu cannot. Currently for gladman AES and the C version of the SHA256
#define CC_HANDLE_UNALIGNED_DATA CC_BASEBAND
// BaseBand configuration
#if CC_BASEBAND
// -- ENDIANNESS
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
#if defined(ENDIAN_LITTLE) || (defined(__arm__) && !defined(__BIG_ENDIAN))
#define __LITTLE_ENDIAN__
#elif !defined(ENDIAN_BIG) && !defined(__BIG_ENDIAN)
#error Baseband endianness not defined.
#endif
#define AESOPT_ENDIAN_NO_FILE
#endif
// -- Architecture
#define CCN_UNIT_SIZE 4 // 32 bits
// -- External function
#define assert ASSERT // sanity
// -- Warnings
// Ignore irrelevant warnings after verification
// #1254-D: arithmetic on pointer to void or function type
// #186-D: pointless comparison of unsigned integer with zero
// #546-D: transfer of control bypasses initialization of
#ifdef __arm__
#pragma diag_suppress 186, 1254,546
#elif defined(__GNUC__)
// warning: pointer of type 'void *' used in arithmetic
#pragma GCC diagnostic ignored "-Wpointer-arith"
#endif // __arm__
#define CC_SMALL_CODE 1
#endif // CC_BASEBAND
#if CC_RTKIT || CC_RTKITROM
#define CC_SMALL_CODE 1
#endif
#ifndef CC_SMALL_CODE
#define CC_SMALL_CODE 0
#endif
//CC_DARWIN indicates the availability of XNU kernel functions,
//like what we have on OSX, iOS, tvOS, Watch OS
#if (CC_USE_L4 || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_EFI || CC_LINUX || defined(_WIN32) || CC_BASEBAND || CC_USE_S3 || CC_ANDROID)
#define CC_DARWIN 0
#else
#define CC_DARWIN 1
#endif
//arm arch64 definition for gcc
#if defined(__GNUC__) && defined(__aarch64__) && !defined(__arm64__)
#define __arm64__
#endif
#if !defined(CCN_UNIT_SIZE)
#if defined(__arm64__) || defined(__x86_64__) || defined(_WIN64)
#define CCN_UNIT_SIZE 8
#elif defined(__arm__) || defined(__i386__) || defined(_WIN32)
#define CCN_UNIT_SIZE 4
#else
#error undefined architecture
#endif
#endif /* !defined(CCN_UNIT_SIZE) */
//this allows corecrypto Windows development using xcode
#if defined(CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT)
#if CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT && CC_DARWIN && CORECRYPTO_DEBUG
#define CC_USE_ASM 0
#define CC_USE_HEAP_FOR_WORKSPACE 1
#if (CCN_UNIT_SIZE==8)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0
#else
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1
#endif
#endif
#endif
#if !defined(CCN_UINT128_SUPPORT_FOR_64BIT_ARCH)
#if defined(_WIN64) && defined(_WIN32) && (CCN_UNIT_SIZE==8)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0
#elif defined(_WIN32)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1 // should not be a problem
#else
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1
#endif
#endif
#if defined(_MSC_VER)
#if defined(__clang__)
#define CC_ALIGNED(x) __attribute__ ((aligned(x))) //clang compiler
#else
#define CC_ALIGNED(x) __declspec(align(x)) //MS compiler
#endif
#else
#if __clang__ || CCN_UNIT_SIZE==8
#define CC_ALIGNED(x) __attribute__ ((aligned(x)))
#else
#define CC_ALIGNED(x) __attribute__ ((aligned((x)>8?8:(x))))
#endif
#endif
#if defined(__arm__)
//this is copied from <arm/arch.h>, because <arm/arch.h> is not available in the SEPROM environment
#if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__) || defined(__ARM_ARCH_7EM__)
#define _ARM_ARCH_7
#endif
#if defined(__ARM_ARCH_6M__) || defined(__TARGET_ARCH_6S_M) || defined (__armv6m__)
#define _ARM_ARCH_6M
#endif
#endif
#if defined(__arm64__) || defined(__arm__)
#define CCN_IOS 1
#define CCN_OSX 0
#elif defined(__x86_64__) || defined(__i386__)
#define CCN_IOS 0
#define CCN_OSX 1
#endif
#if CC_USE_S3
/* For corecrypto kext, CC_STATIC should be undefined */
#define CC_STATIC 1
#endif
#if !defined(CC_USE_HEAP_FOR_WORKSPACE)
#if CC_USE_S3 || CC_USE_SEPROM || CC_RTKITROM
#define CC_USE_HEAP_FOR_WORKSPACE 0
#else
#define CC_USE_HEAP_FOR_WORKSPACE 1
#endif
#endif
/* memset_s is only available in a few targets */
#if CC_USE_SEPROM || defined(__CC_ARM) \
|| defined(__hexagon__) || CC_EFI
#define CC_HAS_MEMSET_S 0
#else
#define CC_HAS_MEMSET_S 1
#endif
// Include target conditionals if available.
#if defined(__has_include) /* portability */
#if __has_include(<TargetConditionals.h>)
#include <TargetConditionals.h>
#endif /* __has_include(<TargetConditionals.h>) */
#endif /* defined(__has_include) */
// Disable RSA Keygen on iBridge
#if defined(TARGET_OS_BRIDGE) && TARGET_OS_BRIDGE && CC_KERNEL
#define CC_DISABLE_RSAKEYGEN 1 /* for iBridge */
#else
#define CC_DISABLE_RSAKEYGEN 0 /* default */
#endif
#if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__))
#define CCEC25519_CURVE25519_64BIT 1
#else
#define CCEC25519_CURVE25519_64BIT 0
#endif
//- functions implemented in assembly ------------------------------------------
// This is the list of corecrypto clients that use assembly and the clang compiler.
#if !(CC_DARWIN || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG
#warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform"
#endif
// Enable assembler in Linux if CC_LINUX_ASM is defined
#if CC_LINUX && defined(CC_LINUX_ASM) && CC_LINUX_ASM
#define CC_USE_ASM 1
#endif
// Use this macro to strictly disable assembly regardless of cpu/os/compiler/etc.
// Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well.
#if !defined(CC_USE_ASM)
#if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_LINUX
#define CC_USE_ASM 0
#else
#define CC_USE_ASM 1
#endif
#endif
#define CC_CACHE_DESCRIPTORS CC_KERNEL
//-(1) ARM V7
#if defined(_ARM_ARCH_7) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR CC_SMALL_CODE
#define CCN_MUL_KARATSUBA 0 // no performance improvement
#define CCN_ADD_ASM 1
#define CCN_SUB_ASM 1
#define CCN_MUL_ASM 0
#define CCN_ADDMUL1_ASM 1
#define CCN_MUL1_ASM 1
#define CCN_CMP_ASM 1
#define CCN_ADD1_ASM 1
#define CCN_SUB1_ASM 1
#define CCN_N_ASM 1
#define CCN_SET_ASM 1
#define CCN_SHIFT_RIGHT_ASM 1
#if defined(__ARM_NEON__)
#define CCN_SHIFT_LEFT_ASM 1
#else
#define CCN_SHIFT_LEFT_ASM 0
#endif
#define CCN_MULMOD_224_ASM 1
#define CCN_MULMOD_256_ASM 1
#define CCAES_ARM_ASM 1
#define CCAES_INTEL_ASM 0
#if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3
#define CCAES_MUX 0
#else
#define CCAES_MUX 1
#endif
#define CCN_USE_BUILTIN_CLZ 1
#define CCSHA1_VNG_INTEL 0
#define CCSHA2_VNG_INTEL 0
#if defined(__ARM_NEON__) || CC_KERNEL
#define CCSHA1_VNG_ARM 1
#define CCSHA2_VNG_ARM 1
#else /* !defined(__ARM_NEON__) */
#define CCSHA1_VNG_ARM 0
#define CCSHA2_VNG_ARM 0
#endif /* !defined(__ARM_NEON__) */
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 1
//-(2) ARM 64
#elif defined(__arm64__) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR CC_SMALL_CODE
#define CCN_MUL_KARATSUBA 0 // 4*n CCN_UNIT extra memory required.
#define CCN_ADD_ASM 1
#define CCN_SUB_ASM 1
#define CCN_MUL_ASM 1
#define CCN_ADDMUL1_ASM 0
#define CCN_MUL1_ASM 0
#define CCN_CMP_ASM 1
#define CCN_ADD1_ASM 0
#define CCN_SUB1_ASM 0
#define CCN_N_ASM 1
#define CCN_SET_ASM 0
#define CCN_SHIFT_RIGHT_ASM 1
#define CCN_SHIFT_LEFT_ASM 1
#define CCN_MULMOD_224_ASM 1
#define CCN_MULMOD_256_ASM 1
#define CCAES_ARM_ASM 1
#define CCAES_INTEL_ASM 0
#define CCAES_MUX 0 // On 64-bit SoCs, asm is much faster than HW
#define CCN_USE_BUILTIN_CLZ 1
#define CCSHA1_VNG_INTEL 0
#define CCSHA2_VNG_INTEL 0
#define CCSHA1_VNG_ARM 1
#define CCSHA2_VNG_ARM 1
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 1
//-(3) Intel 32/64
#elif (defined(__x86_64__) || defined(__i386__)) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR 1
#define CCN_MUL_KARATSUBA 0 // 4*n CCN_UNIT extra memory required.
/* These assembly routines only work for a single CCN_UNIT_SIZE. */
#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) || (defined(__i386__) && CCN_UNIT_SIZE == 4)
#define CCN_ADD_ASM 1
#define CCN_SUB_ASM 1
#define CCN_MUL_ASM 1
#else
#define CCN_ADD_ASM 0
#define CCN_SUB_ASM 0
#define CCN_MUL_ASM 0
#endif
#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8)
#define CCN_CMP_ASM 1
#define CCN_N_ASM 1
#define CCN_SHIFT_RIGHT_ASM 1
#define CCN_SHIFT_LEFT_ASM 1
#else
#define CCN_CMP_ASM 0
#define CCN_N_ASM 0
#define CCN_SHIFT_RIGHT_ASM 0
#define CCN_SHIFT_LEFT_ASM 0
#endif
#define CCN_MULMOD_224_ASM 0
#if defined(__x86_64__) && CCN_UNIT_SIZE == 8
#define CCN_MULMOD_256_ASM 1
#define CCN_ADDMUL1_ASM 1
#define CCN_MUL1_ASM 1
#else
#define CCN_MULMOD_256_ASM 0
#define CCN_ADDMUL1_ASM 0
#define CCN_MUL1_ASM 0
#endif
#define CCN_ADD1_ASM 0
#define CCN_SUB1_ASM 0
#define CCN_SET_ASM 0
#define CCAES_ARM_ASM 0
#define CCAES_INTEL_ASM 1
#define CCAES_MUX 0
#define CCN_USE_BUILTIN_CLZ 0
#define CCSHA1_VNG_INTEL 1
#define CCSHA2_VNG_INTEL 1
#define CCSHA1_VNG_ARM 0
#define CCSHA2_VNG_ARM 0
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 1
//-(4) disable assembly
#else
#if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH
#define CCN_DEDICATED_SQR 1
#else
#define CCN_DEDICATED_SQR 0 // When assembly is off and 128-bit integers are not supported, dedicated squaring is off. This is the case on Windows.
#endif
#define CCN_MUL_KARATSUBA 0 // 4*n CCN_UNIT extra memory required.
#define CCN_ADD_ASM 0
#define CCN_SUB_ASM 0
#define CCN_MUL_ASM 0
#define CCN_ADDMUL1_ASM 0
#define CCN_MUL1_ASM 0
#define CCN_CMP_ASM 0
#define CCN_ADD1_ASM 0
#define CCN_SUB1_ASM 0
#define CCN_N_ASM 0
#define CCN_SET_ASM 0
#define CCN_SHIFT_RIGHT_ASM 0
#define CCN_SHIFT_LEFT_ASM 0
#define CCN_MULMOD_224_ASM 0
#define CCN_MULMOD_256_ASM 0
#define CCAES_ARM_ASM 0
#define CCAES_INTEL_ASM 0
#define CCAES_MUX 0
#define CCN_USE_BUILTIN_CLZ 0
#define CCSHA1_VNG_INTEL 0
#define CCSHA2_VNG_INTEL 0
#define CCSHA1_VNG_ARM 0
#define CCSHA2_VNG_ARM 0
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 0
#endif
#define CC_INLINE static inline
#ifdef __GNUC__
#define CC_NORETURN __attribute__((__noreturn__))
#define CC_NOTHROW __attribute__((__nothrow__))
#define CC_NONNULL(N) __attribute__((__nonnull__ N))
#define CC_NONNULL4 CC_NONNULL((4))
#define CC_NONNULL_ALL __attribute__((__nonnull__))
#define CC_SENTINEL __attribute__((__sentinel__))
// Only apply the `CC_CONST` attribute to functions with no side effects whose output is a strict function of pass-by-value inputs with no exterior side effects.
// Specifically, do not apply CC_CONST if the function has any arguments that are pointers (directly or indirectly).
#define CC_CONST __attribute__((__const__))
#define CC_PURE __attribute__((__pure__))
#define CC_WARN_RESULT __attribute__((__warn_unused_result__))
#define CC_MALLOC_CLEAR __attribute__((__malloc__))
#define CC_UNUSED __attribute__((unused))
#else /* !__GNUC__ */
/*! @parseOnly */
#define CC_UNUSED
/*! @parseOnly */
#define CC_NONNULL(N)
/*! @parseOnly */
#define CC_NONNULL4
/*! @parseOnly */
#define CC_NORETURN
/*! @parseOnly */
#define CC_NOTHROW
/*! @parseOnly */
#define CC_NONNULL_ALL
/*! @parseOnly */
#define CC_SENTINEL
/*! @parseOnly */
#define CC_CONST
/*! @parseOnly */
#define CC_PURE
/*! @parseOnly */
#define CC_WARN_RESULT
/*! @parseOnly */
#define CC_MALLOC_CLEAR
#endif /* !__GNUC__ */
// Bridge differences between Mach-O and ELF compilers/assemblers.
#if CC_LINUX
#define CC_ASM_SECTION_CONST .rodata
#define CC_ASM_PRIVATE_EXTERN .hidden
#if CC_LINUX
// We need to be sure that assembler can access relocated C
// symbols. Sad but this is the quickest way to do that, at least with
// our current linux compiler (clang-3.4).
#define CC_C_LABEL(_sym) _sym@PLT
#endif
#define _IMM(x) $(x)
#else /* !CC_LINUX */
#define CC_ASM_SECTION_CONST .const
#define CC_ASM_PRIVATE_EXTERN .private_extern
#define CC_C_LABEL(_sym) _##_sym
#define _IMM(x) $$(x)
#endif /* !CC_LINUX */
// Enable FIPSPOST function tracing only when supported.
#ifdef CORECRYPTO_POST_TRACE
#define CC_FIPSPOST_TRACE 1
#else
#define CC_FIPSPOST_TRACE 0
#endif
#ifndef CC_INTERNAL_SDK
#if __has_include(<System/i386/cpu_capabilities.h>)
#define CC_INTERNAL_SDK 1
#elif __has_include(<System/arm/cpu_capabilities.h>)
#define CC_INTERNAL_SDK 1
#else
#define CC_INTERNAL_SDK 0
#endif
#endif
// Currently thread sanitizer is only supported in local builds.
// Please edit your "corecrypto_test" scheme to build with thread
// sanitizer and then remove *all* variants of corecrypto_static
// besides "normal"
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define CC_TSAN 1
#else
#define CC_TSAN 0
#endif // __has_feature(thread_sanitizer)
#else
#define CC_TSAN 0
#endif // __has_feature
#endif /* _CORECRYPTO_CC_CONFIG_H_ */

76
cc/corecrypto/cc_debug.h Normal file
View File

@ -0,0 +1,76 @@
/* Copyright (c) (2012,2014-2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
// Debug configuration header file
#ifndef _CORECRYPTO_CCN_DEBUG_H_
#define _CORECRYPTO_CCN_DEBUG_H_
#include <corecrypto/cc_config.h>
// DO NOT INCLUDE this HEADER file in CoreCrypto files added for XNU project or headers
// included by external clients.
// ========================
// Printf for corecrypto
// ========================
#if CC_KERNEL
#include <pexpert/pexpert.h>
#define cc_printf(x...) kprintf(x)
#if !CONFIG_EMBEDDED
extern int printf(const char *format, ...) __printflike(1,2);
#endif
#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT || CC_RTKITROM
#include <stdio.h>
#define cc_printf(x...) printf(x)
#elif defined(__ANDROID_API__)
#include <android/log.h>
#define cc_printf(x...) __android_log_print(ANDROID_LOG_DEBUG, "corecrypto", x);
#else
#include <stdio.h>
#define cc_printf(x...) fprintf(stderr, x)
#endif
// ========================
// Integer types
// ========================
#if CC_KERNEL
/* These are not defined in libkern */
#define PRIx64 "llx"
#define PRIx32 "x"
#define PRIx16 "hx"
#define PRIx8 "hhx"
#else
#include <inttypes.h>
#endif
#if CCN_UNIT_SIZE == 8
#define CCPRIx_UNIT ".016" PRIx64
#elif CCN_UNIT_SIZE == 4
#define CCPRIx_UNIT ".08" PRIx32
#elif CCN_UNIT_SIZE == 2
#define CCPRIx_UNIT ".04" PRIx16
#elif CCN_UNIT_SIZE == 1
#define CCPRIx_UNIT ".02" PRIx8
#else
#error invalid CCN_UNIT_SIZE
#endif
// ========================
// Print utilities for corecrypto
// ========================
#include <corecrypto/cc.h>
/* Print a byte array of arbitrary size */
void cc_print(const char *label, size_t count, const uint8_t *s);
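/* Illustrative usage sketch (not part of the original header): CCPRIx_UNIT is
 * meant to follow a '%' in a cc_printf format string so that a cc_unit prints
 * with the width matching CCN_UNIT_SIZE, and cc_print dumps a raw byte buffer.
 * The variable names below are hypothetical.
 *
 *   cc_unit u = 42;
 *   cc_printf("u = 0x%" CCPRIx_UNIT "\n", u);
 *   cc_print("digest bytes", sizeof(digest), digest);
 */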
#endif /* _CORECRYPTO_CCN_DEBUG_H_ */

165
cc/corecrypto/cc_error.h Normal file
View File

@ -0,0 +1,165 @@
/* Copyright (c) (2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_ERROR_H_
#define _CORECRYPTO_CC_ERROR_H_
enum {
CCERR_OK = 0,
/* the default error code */
CCERR_INTERNAL = -1,
CCERR_INTEGRITY = -2,
CCERR_DEVICE = -3,
CCERR_INTERRUPTS = -4,
CCERR_CRYPTO_CONFIG = -5,
CCERR_PERMS = -6,
CCERR_PARAMETER = -7,
CCERR_MEMORY = -8,
CCERR_FILEDESC = -9,
CCERR_OUT_OF_ENTROPY = -10,
CCERR_ATFORK = -11,
CCERR_OVERFLOW = -12,
CCERR_MEMORY_ALLOC_FAIL = -13,
CCEC_GENERATE_KEY_DEFAULT_ERR = -14,
CCEC_GENERATE_KEY_TOO_MANY_TRIES = -15,
CCEC_GENERATE_KEY_MULT_FAIL = -16,
CCEC_GENERATE_KEY_AFF_FAIL = -17,
CCEC_GENERATE_KEY_CONSISTENCY = -18,
CCEC_GENERATE_NOT_ON_CURVE = -19,
CCEC_GENERATE_NOT_ENOUGH_ENTROPY = -20,
CCEC_GENERATE_NOT_SUPPORTED = -21,
CCEC_GENERATE_INVALID_INPUT = -22,
// Program error: buffer too small or encrypted message is too small
CCRSA_INVALID_INPUT = -23,
// Invalid crypto configuration: Hash length versus RSA key size
CCRSA_INVALID_CONFIG = -24,
CCRSA_ENCODING_ERROR = -25,
CCRSA_DECODING_ERROR = -26,
// The data is invalid (we won't say more for security)
CCRSA_PRIVATE_OP_ERROR = -27,
CCRSA_KEY_ERROR = -28,
// Key generation specific
CCRSA_KEYGEN_PRIME_NOT_FOUND = -29,
CCRSA_KEYGEN_PRIME_NEED_NEW_SEED = -30,
CCRSA_KEYGEN_PRIME_TOO_MANY_ITERATIONS = -31,
CCRSA_KEYGEN_PRIME_SEED_GENERATION_ERROR = -32,
CCRSA_KEYGEN_MODULUS_CRT_INV_ERROR = -33,
CCRSA_KEYGEN_NEXT_PRIME_ERROR = -34,
CCRSA_KEYGEN_SEED_X_ERROR = -35,
CCRSA_KEYGEN_SEED_r_ERROR = -36,
CCRSA_KEYGEN_KEYGEN_CONSISTENCY_FAIL = -37,
CCRSA_KEYGEN_R1R2_SIZE_ERROR = -38,
CCRSA_KEYGEN_PQ_DELTA_ERROR = -39,
CCRSA_FIPS_KEYGEN_DISABLED = -40,
CCZP_INV_ERROR = -41,
CCZP_INV_NO_INVERSE = -42,
CCZP_INV_INVALID_INPUT = -43,
CCZ_INVALID_INPUT_ERROR = -44,
CCZ_INVALID_RADIX_ERROR = -45,
CCDH_ERROR_DEFAULT = -46,
CCDH_GENERATE_KEY_TOO_MANY_TRIES = -47,
CCDH_NOT_SUPPORTED_CONFIGURATION = -48,
CCDH_SAFETY_CHECK = -49,
CCDH_PUBLIC_KEY_MISSING = -50,
CCDH_INVALID_DOMAIN_PARAMETER = -51,
CCDH_INVALID_INPUT = -52,
CCDH_DOMAIN_PARAMETER_MISMATCH = -53,
CCDH_GENERATE_KEY_CONSISTENCY = -54,
CCSRP_ERROR_DEFAULT = -55,
CCSRP_GENERATE_KEY_TOO_MANY_TRIES = -56,
CCSRP_NOT_SUPPORTED_CONFIGURATION = -57,
CCSRP_SAFETY_CHECK = -58,
CCSRP_PUBLIC_KEY_MISSING = -59,
CCSRP_INVALID_DOMAIN_PARAMETER = -60,
CCDRBG_STATUS_ERROR = -61,
CCDRBG_STATUS_NEED_RESEED = -62,
CCDRBG_STATUS_PARAM_ERROR = -63,
// If this value is returned, the caller must abort or panic the process for
// security reasons, for example in the case of a catastrophic error as described in
// http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf
// ccdrbg calls abort() or panic() if they are available on the system.
CCDRBG_STATUS_ABORT = -64,
CCKPRNG_NEED_ENTROPY = -65,
CCKPRNG_ABORT = -66,
CCMODE_INVALID_INPUT = -67,
CCMODE_INVALID_CALL_SEQUENCE = -68,
CCMODE_INTEGRITY_FAILURE = -69,
CCMODE_NOT_SUPPORTED = -70,
CCMODE_INTERNAL_ERROR = -71,
// Configuration or unexpected issue
CCPOST_GENERIC_FAILURE = -72,
CCPOST_LIBRARY_ERROR = -73,
CCPOST_INTEGRITY_ERROR = -74,
// Output of the algo is not as expected
CCPOST_KAT_FAILURE = -75,
CCKPRNG_SEEDFILE_OPEN = -76,
CCKPRNG_SEEDFILE_READ = -78,
CCKPRNG_SEEDFILE_WRITE = -79,
CCKPRNG_SEEDFILE_CHMOD = -80,
CCKPRNG_SEEDFILE_CHOWN = -81,
CCKPRNG_RANDOMDEV_OPEN = -82,
CCKPRNG_RANDOMDEV_WRITE = -83,
CCKPRNG_GETENTROPY = -84,
CCSAE_HUNTPECK_EXCEEDED_MAX_TRIALS = -85,
CCERR_CALL_SEQUENCE = -86,
CCVRF_POINT_DECODE_FAILURE = -87,
CCVRF_POINT_INVALID_PUBLIC_KEY = -88,
CCVRF_VERIFY_FAILURE = -89,
// Error codes for Authenticated Encryption Modes
CCMODE_TAG_LENGTH_REQUEST_TOO_LONG = -100,
CCMODE_TAG_LENGTH_TOO_SHORT = -101,
CCMODE_NONCE_EMPTY = -102,
CCMODE_AD_EMPTY = -103,
CCMODE_DECRYPTION_OR_VERIFICATION_ERR=-104,
CCMODE_BUFFER_OUT_IN_OVERLAP = -105,
CCSAE_NOT_ENOUGH_COMMIT_PARTIAL_CALLS = -132,
CCSAE_GENERATE_COMMIT_CALL_AGAIN = -133,
CCERR_VALID_SIGNATURE = CCERR_OK,
CCERR_INVALID_SIGNATURE = -146,
CCERR_IOSERVICE_GETMATCHING = -147,
CCERR_IOSERVICE_OPEN = -148,
CCERR_IOCONNECT_CALL = -149,
CCEC_KEY_CANNOT_BE_UNIT = -160,
CCEC_COMPRESSED_POINT_ENCODING_ERROR = -161,
CCERR_RNG_NOT_SEEDED = -162,
};
#define CCDRBG_STATUS_OK CCERR_OK
#define CCKPRNG_OK CCERR_OK
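// Illustrative convention (not part of the original header): corecrypto
// routines return CCERR_OK (0) on success and a negative code otherwise, so
// callers usually propagate the value unchanged; `op` below is hypothetical.
//
//   int rv = op(...);
//   if (rv != CCERR_OK) { return rv; }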
#endif /* _CORECRYPTO_CC_ERROR_H_ */

View File

@ -0,0 +1,29 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef corecrypto_cc_fault_canary_h
#define corecrypto_cc_fault_canary_h
#include "cc.h"
#define CC_FAULT_CANARY_SIZE 16
typedef uint8_t cc_fault_canary_t[CC_FAULT_CANARY_SIZE];
extern const cc_fault_canary_t CCEC_FAULT_CANARY;
extern const cc_fault_canary_t CCRSA_PKCS1_FAULT_CANARY;
extern const cc_fault_canary_t CCRSA_PSS_FAULT_CANARY;
#define CC_FAULT_CANARY_MEMCPY(_dst_, _src_) memcpy(_dst_, _src_, CC_FAULT_CANARY_SIZE)
#define CC_FAULT_CANARY_CLEAR(_name_) memset(_name_, 0x00, CC_FAULT_CANARY_SIZE)
#define CC_FAULT_CANARY_EQUAL(_a_, _b_) (cc_cmp_safe(CC_FAULT_CANARY_SIZE, _a_, _b_) == 0)
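// Illustrative usage sketch (not part of the original header): a caller zeroes
// a canary, passes it to an operation that fills it on success, and only
// trusts the result if the canary matches the expected constant. The function
// name `sign_or_verify_op` is hypothetical.
//
//   cc_fault_canary_t canary;
//   CC_FAULT_CANARY_CLEAR(canary);
//   int rv = sign_or_verify_op(..., canary);
//   if (rv == 0 && CC_FAULT_CANARY_EQUAL(canary, CCRSA_PKCS1_FAULT_CANARY)) {
//       // accept the result
//   }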
#endif /* corecrypto_cc_fault_canary_h */

View File

@ -0,0 +1,27 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef corecrypto_cc_fault_canary_internal_h
#define corecrypto_cc_fault_canary_internal_h
/*!
@function cc_fault_canary_set
@abstract Set the output `fault_canary_out` to the value `fault_canary` if the two inputs are equal.
@param fault_canary_out Output fault canary value
@param fault_canary Fault canary for a specific operation (e.g. CCEC_FAULT_CANARY for ECC signing)
@param nbytes Byte length of inputs in1 and in2
@param in1 Input one
@param in2 Input two
*/
void cc_fault_canary_set(cc_fault_canary_t fault_canary_out, const cc_fault_canary_t fault_canary, size_t nbytes, const uint8_t *in1, const uint8_t *in2);
#endif /* corecrypto_cc_fault_canary_internal_h */

View File

@ -0,0 +1,16 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stdbool.h>
#include <stdint.h>
#include <corecrypto/cc_priv.h>
extern bool cc_rdrand(uint64_t *rand);

150
cc/corecrypto/cc_macros.h Normal file
View File

@ -0,0 +1,150 @@
/* Copyright (c) (2012,2015,2016,2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_MACROS_H_
#define _CORECRYPTO_CC_MACROS_H_
#include <corecrypto/cc_config.h>
#ifndef __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING
#define __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING ""
#endif
#ifndef __CC_DEBUG_ASSERT_PRODUCTION_CODE
#define __CC_DEBUG_ASSERT_PRODUCTION_CODE !CORECRYPTO_DEBUG
#endif
#if CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS
#if !CC_KERNEL
#include <string.h> // for strstr
#endif // !CC_KERNEL
CC_UNUSED static char *cc_strstr(const char *file) {
#if CC_KERNEL
(void) file;
#else
const char cc_char []="corecrypto";
char *p=strstr(file, cc_char);
if (p) return (p+strlen(cc_char)+1);
#endif
return NULL;
}
#define __CC_DEBUG_REQUIRE_MESSAGE(name, assertion, label, message, file, line, value) \
{char *___t = cc_strstr(file); cc_printf( "require: %s, %s%s:%d\n", assertion, (message!=0) ? message : "", ___t==NULL?file:___t, line);}
#endif // CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS
#ifndef cc_require
#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
#if defined(_WIN32) && defined (__clang__)
#define cc_require(assertion, exceptionLabel) \
do { \
if (!(assertion) ) { \
goto exceptionLabel; \
} \
} while ( 0 )
#else
#define cc_require(assertion, exceptionLabel) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#else
#define cc_require(assertion, exceptionLabel) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
__CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
#assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#endif
#ifndef cc_require_action
#if __CC_DEBUG_ASSERT_PRODUCTION_CODE || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
#if defined(_WIN32) && defined(__clang__)
#define cc_require_action(assertion, exceptionLabel, action) \
do \
{ \
if (!(assertion)) \
{ \
{ \
action; \
} \
goto exceptionLabel; \
} \
} while ( 0 )
#else
#define cc_require_action(assertion, exceptionLabel, action) \
do \
{ \
if ( __builtin_expect(!(assertion), 0) ) \
{ \
{ \
action; \
} \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#else
#define cc_require_action(assertion, exceptionLabel, action) \
do \
{ \
if ( __builtin_expect(!(assertion), 0) ) \
{ \
__CC_DEBUG_REQUIRE_MESSAGE( \
__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
#assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \
{ \
action; \
} \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#endif
#ifndef cc_require_or_return
#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
#if defined(_WIN32) && defined (__clang__)
#define cc_require_or_return(assertion, value) \
do { \
if (!(assertion) ) { \
return value; \
} \
} while ( 0 )
#else
#define cc_require_or_return(assertion, value) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
return value; \
} \
} while ( 0 )
#endif
#else
#define cc_require_or_return(assertion, value) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
__CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
 #assertion, #value, 0, __FILE__, __LINE__, 0); \
return value; \
} \
} while ( 0 )
#endif
#endif
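// Illustrative usage sketch (not part of the original header): the typical
// cc_require / cc_require_action pattern bails out to a cleanup label when a
// check fails. The function name, label, and error values below are hypothetical.
CC_UNUSED static int cc_require_usage_example(const void *in, int len)
{
    int rv = -7; // e.g. CCERR_PARAMETER
    cc_require(in != NULL, out);              // jump to "out" when the check fails
    cc_require_action(len > 0, out, rv = -12); // run the action, then jump to "out"
    rv = 0; // success, e.g. CCERR_OK
out:
    return rv;
}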
#endif /* _CORECRYPTO_CC_MACROS_H_ */

192
cc/corecrypto/cc_memory.h Normal file
View File

@ -0,0 +1,192 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include "cc_config.h"
#ifndef corecrypto_cc_memory_h
#define corecrypto_cc_memory_h
#if CORECRYPTO_DEBUG && !defined(_WIN32) && !defined(_WIN64)
#define CC_ALLOC_DEBUG 1
#endif
struct ws_dbg {
const void *p;
const char *file;
int line;
const char *func;
};
#if defined(CC_ALLOC_DEBUG)
extern struct ws_dbg g_ws_dbg;
#endif
#include <corecrypto/cc_config.h>
#include <corecrypto/cc_error.h>
#include "cc_debug.h"
#include <corecrypto/cc_priv.h>
CC_INLINE void cc_alloc_debug(CC_UNUSED const void *p, CC_UNUSED const char *file, CC_UNUSED int line, CC_UNUSED const char *func)
{
#if defined(CC_ALLOC_DEBUG)
// The contract for some clients is to have a single malloc at a time
cc_assert(g_ws_dbg.p == NULL);
g_ws_dbg = (struct ws_dbg){ p, file, line, func };
#endif
}
CC_INLINE void cc_free_debug(CC_UNUSED const void *p)
{
#if defined(CC_ALLOC_DEBUG)
// The contract for some clients is to have a single malloc at a time
cc_assert(g_ws_dbg.p == p); // Free the address we allocated
g_ws_dbg = (struct ws_dbg){};
#endif
}
// =============================================================================
// Declare workspace with memory on the STACK
// This is the least preferred option since most corecrypto clients have
// small stacks. It is still useful for small allocations and when errors
// can't easily be propagated.
// =============================================================================
// Declare a variable on the stack and use its address.
// Only use this when we don't have a way to propagate errors.
#define CC_DECL_WORKSPACE_STACK(ws, n) \
cc_unit ws##_buf[(n)]; \
cc_ws ws##_ctx = { &ws##_buf[0], &ws##_buf[(n)] }; \
cc_ws_t ws = &ws##_ctx; \
cc_alloc_debug(ws->start, __FILE__, __LINE__, __func__);
// Reset pointers to avoid future reference
#define CC_FREE_WORKSPACE_STACK(ws) \
cc_free_debug(ws->start); \
ws->start = NULL; \
ws->end = NULL;
#define CC_CLEAR_AND_FREE_WORKSPACE_STACK(ws) \
cc_try_abort_if(ws->start > ws->end, "free ws"); \
ccn_clear((cc_size)(ws->end - ws->start), ws->start); \
CC_FREE_WORKSPACE_STACK(ws);
// =============================================================================
// Declare workspace in the region corresponding to HEAP or STACK,
// depending on the setting of CC_USE_HEAP_FOR_WORKSPACE.
// This should be the preference for large memory allocations, but it requires
// propagating an error in case of allocation failure.
// =============================================================================
#if CC_USE_HEAP_FOR_WORKSPACE
// Malloc/free functions to be used
#if CC_KERNEL
#include <IOKit/IOLib.h>
#include <vm/pmap.h>
CC_INLINE void *cc_malloc_clear(size_t s)
{
void *p = NULL;
if (pmap_in_ppl()) {
if (s > PAGE_SIZE) {
panic("PPL cc_malloc_clear trying to allocate %zu > PAGE_SIZE", s);
}
p = pmap_claim_reserved_ppl_page();
} else {
p = IOMalloc(s);
}
if (p != NULL) {
memset(p, 0, s);
}
return p;
}
CC_INLINE void cc_free(void *p, size_t size)
{
if (pmap_in_ppl()) {
if (size > PAGE_SIZE) {
panic("PPL cc_malloc_clear trying to free %zu > PAGE_SIZE", size);
}
pmap_free_reserved_ppl_page(p);
return;
}
IOFree(p, size);
}
#else // !CC_KERNEL
#include <stdlib.h>
CC_INLINE void *cc_malloc_clear(size_t s)
{
void *p = malloc(s);
if (p != NULL) {
memset(p, 0, s);
}
return p;
}
CC_INLINE void cc_free(void *p, size_t size CC_UNUSED)
{
free(p);
}
#endif // !CC_KERNEL
#define CC_DECL_WORKSPACE_OR_FAIL(ws, n) \
cc_unit *ws##_buf = (cc_unit *) cc_malloc_clear(ccn_sizeof_n((n))); \
cc_ws ws##_ctx = { &ws##_buf[0], &ws##_buf[(n)] }; \
cc_ws_t ws = &ws##_ctx; \
if (NULL == ws->start) \
return CCERR_MEMORY_ALLOC_FAIL; \
cc_alloc_debug(ws->start, __FILE__, __LINE__, __func__);
// Free and reset pointers to avoid future references
#define CC_FREE_WORKSPACE(ws) \
cc_free_debug(ws->start); \
cc_try_abort_if(ws->start > ws->end, "free ws"); \
cc_free(ws->start, (size_t)(ws->end - ws->start) * sizeof(ws->start[0])); \
ws->start = NULL; \
ws->end = NULL;
#else // !CC_USE_HEAP_FOR_WORKSPACE
// Declare a variable on the stack and use its address.
// We could use alloca, but alloca is neither portable nor secure.
#define CC_DECL_WORKSPACE_OR_FAIL CC_DECL_WORKSPACE_STACK
// Reset pointers to avoid future reference
#define CC_FREE_WORKSPACE CC_FREE_WORKSPACE_STACK
#endif // !CC_USE_HEAP_FOR_WORKSPACE
// =============================================================================
// Common
// =============================================================================
#define CC_CLEAR_AND_FREE_WORKSPACE(ws) \
cc_try_abort_if(ws->start > ws->end, "clear ws"); \
ccn_clear((cc_size)(ws->end - ws->start), ws->start); \
CC_FREE_WORKSPACE(ws);
// To allocate an array of n cc_units in the WS
#define CC_DECL_BP_WS(ws, bp) cc_unit *bp = ws->start;
#define CC_FREE_BP_WS(ws, bp) ws->start = bp;
#define CC_ALLOC_WS(ws, n) \
ws->start; \
ws->start += n; \
cc_try_abort_if(ws->start > ws->end, "alloc ws");
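// Illustrative usage sketch (not part of the original header): a typical
// workspace flow with hypothetical names. Declare the workspace (heap or
// stack, per CC_USE_HEAP_FOR_WORKSPACE), carve out scratch space, roll back,
// then zeroize and release.
//
//   int op(cc_size n) {
//       CC_DECL_WORKSPACE_OR_FAIL(ws, 4 * n); // returns CCERR_MEMORY_ALLOC_FAIL on failure
//       CC_DECL_BP_WS(ws, bp);                // remember the current base pointer
//       cc_unit *t = CC_ALLOC_WS(ws, n);      // n cc_units of scratch
//       /* ... use t ... */
//       CC_FREE_BP_WS(ws, bp);                // roll back the allocation
//       CC_CLEAR_AND_FREE_WORKSPACE(ws);      // zeroize and free
//       return CCERR_OK;
//   }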
#if CC_KERNEL
#include <libkern/section_keywords.h>
#define CC_READ_ONLY_LATE(_t) SECURITY_READ_ONLY_LATE(_t)
#else
#define CC_READ_ONLY_LATE(_t) _t
#endif
#endif // corecrypto_cc_memory_h

818
cc/corecrypto/cc_priv.h Normal file
View File

@ -0,0 +1,818 @@
/* Copyright (c) (2010,2011,2012,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_PRIV_H_
#define _CORECRYPTO_CC_PRIV_H_
#include <corecrypto/cc.h>
#include <stdbool.h>
#include <stdint.h>
// Fork handlers for the stateful components of corecrypto.
void cc_atfork_prepare(void);
void cc_atfork_parent(void);
void cc_atfork_child(void);
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#ifndef __DECONST
#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
#endif
/* defines the following macros :
 CC_ARRAY_LEN: returns the number of elements in an array
 CC_STORE32_BE : store a 32-bit value in big-endian order in an unaligned buffer.
 CC_STORE32_LE : store a 32-bit value in little-endian order in an unaligned buffer.
 CC_STORE64_BE : store a 64-bit value in big-endian order in an unaligned buffer.
 CC_STORE64_LE : store a 64-bit value in little-endian order in an unaligned buffer.
 CC_LOAD32_BE : load a 32-bit value in big-endian order from an unaligned buffer.
 CC_LOAD32_LE : load a 32-bit value in little-endian order from an unaligned buffer.
 CC_LOAD64_BE : load a 64-bit value in big-endian order from an unaligned buffer.
 CC_LOAD64_LE : load a 64-bit value in little-endian order from an unaligned buffer.
 CC_ROR : rotate right 32 bits. The rotate count can be a variable.
 CC_ROL : rotate left 32 bits. The rotate count can be a variable.
 CC_RORc : rotate right 32 bits. The rotate count must be a constant.
 CC_ROLc : rotate left 32 bits. The rotate count must be a constant.
 CC_ROR64 : rotate right 64 bits. The rotate count can be a variable.
 CC_ROL64 : rotate left 64 bits. The rotate count can be a variable.
 CC_ROR64c : rotate right 64 bits. The rotate count must be a constant.
 CC_ROL64c : rotate left 64 bits. The rotate count must be a constant.
 CC_BSWAP32 : byte swap a 32-bit variable.
 CC_H2BE32 : convert a 32-bit value between host and big-endian byte order.
 CC_H2LE32 : convert a 32-bit value between host and little-endian byte order.
 CC_BSWAP64 : byte swap a 64-bit variable.
 CC_READ_LE32 : read a 32-bit little-endian value.
 CC_WRITE_LE32 : write a 32-bit little-endian value.
 CC_WRITE_LE64 : write a 64-bit little-endian value.
 CC_H2BE64 : convert a 64-bit value between host and big-endian byte order.
 CC_H2LE64 : convert a 64-bit value between host and little-endian byte order.
*/
// RTKitOSPlatform should replace CC_MEMCPY with memcpy
#define CC_MEMCPY(D,S,L) cc_memcpy((D),(S),(L))
#define CC_MEMMOVE(D,S,L) cc_memmove((D),(S),(L))
#define CC_MEMSET(D,V,L) cc_memset((D),(V),(L))
#if __has_builtin(__builtin___memcpy_chk) && !defined(_MSC_VER)
#define cc_memcpy(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 1))
#define cc_memcpy_nochk(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 0))
#else
#define cc_memcpy(dst, src, len) memcpy((dst), (src), (len))
#define cc_memcpy_nochk(dst, src, len) memcpy((dst), (src), (len))
#endif
#if __has_builtin(__builtin___memmove_chk) && !defined(_MSC_VER)
#define cc_memmove(dst, src, len) __builtin___memmove_chk((dst), (src), (len), __builtin_object_size((dst), 1))
#else
#define cc_memmove(dst, src, len) memmove((dst), (src), (len))
#endif
#if __has_builtin(__builtin___memset_chk) && !defined(_MSC_VER)
#define cc_memset(dst, val, len) __builtin___memset_chk((dst), (val), (len), __builtin_object_size((dst), 1))
#else
#define cc_memset(dst, val, len) memset((dst), (val), (len))
#endif
#define CC_ARRAY_LEN(x) (sizeof((x))/sizeof((x)[0]))
// MARK: - Loads and Store
// MARK: -- 32 bits - little endian
// MARK: --- Default version
#define CC_STORE32_LE(x, y) do { \
((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[0] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD32_LE(x, y) do { \
x = ((uint32_t)(((const unsigned char *)(y))[3] & 255)<<24) | \
((uint32_t)(((const unsigned char *)(y))[2] & 255)<<16) | \
((uint32_t)(((const unsigned char *)(y))[1] & 255)<<8) | \
((uint32_t)(((const unsigned char *)(y))[0] & 255)); \
} while(0)
// MARK: -- 64 bits - little endian
#define CC_STORE64_LE(x, y) do { \
((unsigned char *)(y))[7] = (unsigned char)(((x)>>56)&255); \
((unsigned char *)(y))[6] = (unsigned char)(((x)>>48)&255); \
((unsigned char *)(y))[5] = (unsigned char)(((x)>>40)&255); \
((unsigned char *)(y))[4] = (unsigned char)(((x)>>32)&255); \
((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[0] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD64_LE(x, y) do { \
x = (((uint64_t)(((const unsigned char *)(y))[7] & 255))<<56) | \
(((uint64_t)(((const unsigned char *)(y))[6] & 255))<<48) | \
(((uint64_t)(((const unsigned char *)(y))[5] & 255))<<40) | \
(((uint64_t)(((const unsigned char *)(y))[4] & 255))<<32) | \
(((uint64_t)(((const unsigned char *)(y))[3] & 255))<<24) | \
(((uint64_t)(((const unsigned char *)(y))[2] & 255))<<16) | \
(((uint64_t)(((const unsigned char *)(y))[1] & 255))<<8) | \
(((uint64_t)(((const unsigned char *)(y))[0] & 255))); \
} while(0)
// MARK: -- 32 bits - big endian
// MARK: --- intel version
#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
#define CC_STORE32_BE(x, y) \
__asm__ __volatile__ ( \
"bswapl %0 \n\t" \
"movl %0,(%1)\n\t" \
"bswapl %0 \n\t" \
::"r"(x), "r"(y))
#define CC_LOAD32_BE(x, y) \
__asm__ __volatile__ ( \
"movl (%1),%0\n\t" \
"bswapl %0\n\t" \
:"=r"(x): "r"(y))
#else
// MARK: --- default version
#define CC_STORE32_BE(x, y) do { \
((unsigned char *)(y))[0] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[3] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD32_BE(x, y) do { \
x = ((uint32_t)(((const unsigned char *)(y))[0] & 255)<<24) | \
((uint32_t)(((const unsigned char *)(y))[1] & 255)<<16) | \
((uint32_t)(((const unsigned char *)(y))[2] & 255)<<8) | \
((uint32_t)(((const unsigned char *)(y))[3] & 255)); \
} while(0)
#endif
// MARK: -- 64 bits - big endian
// MARK: --- intel 64 bits version
#if defined(__x86_64__) && !defined (_MSC_VER)
#define CC_STORE64_BE(x, y) \
__asm__ __volatile__ ( \
"bswapq %0 \n\t" \
"movq %0,(%1)\n\t" \
"bswapq %0 \n\t" \
::"r"(x), "r"(y))
#define CC_LOAD64_BE(x, y) \
__asm__ __volatile__ ( \
"movq (%1),%0\n\t" \
"bswapq %0\n\t" \
:"=r"(x): "r"(y))
#else
// MARK: --- default version
#define CC_STORE64_BE(x, y) do { \
((unsigned char *)(y))[0] = (unsigned char)(((x)>>56)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>48)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>40)&255); \
((unsigned char *)(y))[3] = (unsigned char)(((x)>>32)&255); \
((unsigned char *)(y))[4] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[5] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[6] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[7] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD64_BE(x, y) do { \
x = (((uint64_t)(((const unsigned char *)(y))[0] & 255))<<56) | \
(((uint64_t)(((const unsigned char *)(y))[1] & 255))<<48) | \
(((uint64_t)(((const unsigned char *)(y))[2] & 255))<<40) | \
(((uint64_t)(((const unsigned char *)(y))[3] & 255))<<32) | \
(((uint64_t)(((const unsigned char *)(y))[4] & 255))<<24) | \
(((uint64_t)(((const unsigned char *)(y))[5] & 255))<<16) | \
(((uint64_t)(((const unsigned char *)(y))[6] & 255))<<8) | \
(((uint64_t)(((const unsigned char *)(y))[7] & 255))); \
} while(0)
#endif
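// Illustrative usage sketch (not part of the original header): store a 32-bit
// value as big-endian bytes into an unaligned buffer and read it back. The
// function name is hypothetical.
CC_UNUSED static uint32_t cc_store_load32_be_example(uint8_t out[4])
{
    uint32_t v = 0x01020304;
    CC_STORE32_BE(v, out); // out[] = { 0x01, 0x02, 0x03, 0x04 }
    uint32_t r;
    CC_LOAD32_BE(r, out);  // r == 0x01020304
    return r;
}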
// MARK: - 32-bit Rotates
#if defined(_MSC_VER)
// MARK: -- MSVC version
#include <stdlib.h>
#if !defined(__clang__)
#pragma intrinsic(_lrotr,_lrotl)
#endif
#define CC_ROR(x,n) _lrotr(x,n)
#define CC_ROL(x,n) _lrotl(x,n)
#define CC_RORc(x,n) _lrotr(x,n)
#define CC_ROLc(x,n) _lrotl(x,n)
#elif (defined(__i386__) || defined(__x86_64__))
// MARK: -- intel asm version
CC_INLINE uint32_t CC_ROL(uint32_t word, int i)
{
__asm__ ("roll %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
CC_INLINE uint32_t CC_ROR(uint32_t word, int i)
{
__asm__ ("rorl %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
/* Need to be a macro here, because 'i' is an immediate (constant) */
#define CC_ROLc(word, i) \
({ uint32_t _word=(word); \
__asm__ __volatile__ ("roll %2,%0" \
:"=r" (_word) \
:"0" (_word),"I" (i)); \
_word; \
})
#define CC_RORc(word, i) \
({ uint32_t _word=(word); \
__asm__ __volatile__ ("rorl %2,%0" \
:"=r" (_word) \
:"0" (_word),"I" (i)); \
_word; \
})
#else
// MARK: -- default version
CC_INLINE uint32_t CC_ROL(uint32_t word, int i)
{
return ( (word<<(i&31)) | (word>>(32-(i&31))) );
}
CC_INLINE uint32_t CC_ROR(uint32_t word, int i)
{
return ( (word>>(i&31)) | (word<<(32-(i&31))) );
}
#define CC_ROLc(x, y) CC_ROL(x, y)
#define CC_RORc(x, y) CC_ROR(x, y)
#endif
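// Illustrative check (not part of the original header): a left rotation by one
// moves the top bit back into bit 0. The function name is hypothetical.
CC_UNUSED static uint32_t cc_rol32_example(void)
{
    return CC_ROL(0x80000001, 1); // == 0x00000003
}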
// MARK: - 64 bits rotates
#if defined(__x86_64__) && !defined(_MSC_VER) //clang _MSVC doesn't support GNU-style inline assembly
// MARK: -- intel 64 asm version
CC_INLINE uint64_t CC_ROL64(uint64_t word, int i)
{
__asm__("rolq %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
{
__asm__("rorq %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
/* Need to be a macro here, because 'i' is an immediate (constant) */
#define CC_ROL64c(word, i) \
({ \
uint64_t _word=(word); \
__asm__("rolq %2,%0" \
:"=r" (_word) \
:"0" (_word),"J" (i)); \
_word; \
})
#define CC_ROR64c(word, i) \
({ \
uint64_t _word=(word); \
__asm__("rorq %2,%0" \
:"=r" (_word) \
:"0" (_word),"J" (i)); \
_word; \
})
#else /* Not x86_64 */
// MARK: -- default C version
CC_INLINE uint64_t CC_ROL64(uint64_t word, int i)
{
return ( (word<<(i&63)) | (word>>(64-(i&63))) );
}
CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
{
return ( (word>>(i&63)) | (word<<(64-(i&63))) );
}
#define CC_ROL64c(x, y) CC_ROL64(x, y)
#define CC_ROR64c(x, y) CC_ROR64(x, y)
#endif
// MARK: - Byte Swaps
#if __has_builtin(__builtin_bswap32)
#define CC_BSWAP32(x) __builtin_bswap32(x)
#else
CC_INLINE uint32_t CC_BSWAP32(uint32_t x)
{
return
((x & 0xff000000) >> 24) |
((x & 0x00ff0000) >> 8) |
((x & 0x0000ff00) << 8) |
((x & 0x000000ff) << 24);
}
#endif
#if __has_builtin(__builtin_bswap64)
#define CC_BSWAP64(x) __builtin_bswap64(x)
#else
CC_INLINE uint64_t CC_BSWAP64(uint64_t x)
{
return
((x & 0xff00000000000000ULL) >> 56) |
((x & 0x00ff000000000000ULL) >> 40) |
((x & 0x0000ff0000000000ULL) >> 24) |
((x & 0x000000ff00000000ULL) >> 8) |
((x & 0x00000000ff000000ULL) << 8) |
((x & 0x0000000000ff0000ULL) << 24) |
((x & 0x000000000000ff00ULL) << 40) |
((x & 0x00000000000000ffULL) << 56);
}
#endif
#ifdef __LITTLE_ENDIAN__
#define CC_H2BE32(x) CC_BSWAP32(x)
#define CC_H2LE32(x) (x)
#define CC_H2BE64(x) CC_BSWAP64(x)
#define CC_H2LE64(x) (x)
#else
#define CC_H2BE32(x) (x)
#define CC_H2LE32(x) CC_BSWAP32(x)
#define CC_H2BE64(x) (x)
#define CC_H2LE64(x) CC_BSWAP64(x)
#endif
#define CC_READ_LE32(ptr) \
( (uint32_t)( \
((uint32_t)((const uint8_t *)(ptr))[0]) | \
(((uint32_t)((const uint8_t *)(ptr))[1]) << 8) | \
(((uint32_t)((const uint8_t *)(ptr))[2]) << 16) | \
(((uint32_t)((const uint8_t *)(ptr))[3]) << 24)))
#define CC_WRITE_LE32(ptr, x) \
do { \
((uint8_t *)(ptr))[0] = (uint8_t)( (x) & 0xFF); \
((uint8_t *)(ptr))[1] = (uint8_t)(((x) >> 8) & 0xFF); \
((uint8_t *)(ptr))[2] = (uint8_t)(((x) >> 16) & 0xFF); \
((uint8_t *)(ptr))[3] = (uint8_t)(((x) >> 24) & 0xFF); \
} while(0)
#define CC_WRITE_LE64(ptr, x) \
do { \
((uint8_t *)(ptr))[0] = (uint8_t)( (x) & 0xFF); \
((uint8_t *)(ptr))[1] = (uint8_t)(((x) >> 8) & 0xFF); \
((uint8_t *)(ptr))[2] = (uint8_t)(((x) >> 16) & 0xFF); \
((uint8_t *)(ptr))[3] = (uint8_t)(((x) >> 24) & 0xFF); \
((uint8_t *)(ptr))[4] = (uint8_t)(((x) >> 32) & 0xFF); \
((uint8_t *)(ptr))[5] = (uint8_t)(((x) >> 40) & 0xFF); \
((uint8_t *)(ptr))[6] = (uint8_t)(((x) >> 48) & 0xFF); \
((uint8_t *)(ptr))[7] = (uint8_t)(((x) >> 56) & 0xFF); \
} while(0)
/* extract a byte portably */
#ifdef _MSC_VER
#define cc_byte(x, n) ((unsigned char)((x) >> (8 * (n))))
#else
#define cc_byte(x, n) (((x) >> (8 * (n))) & 255)
#endif
/* Count leading zeros (for nonzero inputs) */
/*
* On i386 and x86_64, we know clang and GCC will generate BSR for
* __builtin_clzl. This instruction IS NOT constant time on all micro-
* architectures, but it *is* constant time on all micro-architectures that
* have been used by Apple, and we expect that to continue to be the case.
*
* When building for x86_64h with clang, this produces LZCNT, which is exactly
* what we want.
*
* On arm and arm64, we know that clang and GCC generate the constant-time CLZ
* instruction from __builtin_clzl( ).
*/
#if defined(_WIN32)
/* We use the Windows implementations below. */
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__)
/* We use a thought-to-be-good version of __builtin_clz. */
#elif defined __GNUC__
#warning Using __builtin_clz() on an unknown architecture; it may not be constant-time.
/* If you find yourself seeing this warning, file a radar for someone to
* check whether or not __builtin_clz() generates a constant-time
* implementation on the architecture you are targeting. If it does, append
* the name of that architecture to the list of "safe" architectures above. */
#endif
CC_INLINE CC_CONST unsigned cc_clz32_fallback(uint32_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from LSB to MSB
for (int i = 0; i < 32; i++) {
bit = (data >> i) & 1;
// If the bit is 0, update the "leading bits are zero" counter "b".
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
CC_INLINE CC_CONST unsigned cc_clz64_fallback(uint64_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from LSB to MSB
for (int i = 0; i < 64; i++) {
bit = (data >> i) & 1;
// If the bit is 0, update the "leading bits are zero" counter.
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
CC_INLINE CC_CONST unsigned cc_ctz32_fallback(uint32_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from MSB to LSB
for (int i = 31; i >= 0; i--) {
bit = (data >> i) & 1;
// If the bit is 0, update the "trailing zero bits" counter.
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
CC_INLINE CC_CONST unsigned cc_ctz64_fallback(uint64_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from MSB to LSB
for (int i = 63; i >= 0; i--) {
bit = (data >> i) & 1;
// If the bit is 0, update the "trailing zero bits" counter.
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
/*!
@function cc_clz32
@abstract Count leading zeros of a nonzero 32-bit value
@param data A nonzero 32-bit value
@result Count of leading zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_clz32(uint32_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_clz32_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
cc_static_assert(sizeof(unsigned) == 4, "clz relies on an unsigned int being 4 bytes");
return (unsigned)__builtin_clz(data);
#else
return cc_clz32_fallback(data);
#endif
}
/*!
@function cc_clz64
@abstract Count leading zeros of a nonzero 64-bit value
@param data A nonzero 64-bit value
@result Count of leading zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_clz64(uint64_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_clz64_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
return (unsigned)__builtin_clzll(data);
#else
return cc_clz64_fallback(data);
#endif
}
/*!
@function cc_ctz32
@abstract Count trailing zeros of a nonzero 32-bit value
@param data A nonzero 32-bit value
@result Count of trailing zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_ctz32(uint32_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_ctz32_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
cc_static_assert(sizeof(unsigned) == 4, "ctz relies on an unsigned int being 4 bytes");
return (unsigned)__builtin_ctz(data);
#else
return cc_ctz32_fallback(data);
#endif
}
/*!
@function cc_ctz64
@abstract Count trailing zeros of a nonzero 64-bit value
@param data A nonzero 64-bit value
@result Count of trailing zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_ctz64(uint64_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_ctz64_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
return (unsigned)__builtin_ctzll(data);
#else
return cc_ctz64_fallback(data);
#endif
}
/*!
@function cc_ffs32_fallback
@abstract Find first bit set in a 32-bit value
@param data A 32-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs32_fallback(int32_t data)
{
unsigned b = 0;
unsigned bit = 0;
unsigned seen = 0;
// Work from LSB to MSB
for (int i = 0; i < 32; i++) {
bit = ((uint32_t)data >> i) & 1;
// Track whether we've seen a 1 bit.
seen |= bit;
// If the bit is 0 and we haven't seen a 1 yet, increment b.
b += (1 - bit) & (seen - 1);
}
// If we saw a 1, return b + 1, else 0.
return (~(seen - 1)) & (b + 1);
}
/*!
@function cc_ffs64_fallback
@abstract Find first bit set in a 64-bit value
@param data A 64-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs64_fallback(int64_t data)
{
unsigned b = 0;
unsigned bit = 0;
unsigned seen = 0;
// Work from LSB to MSB
for (int i = 0; i < 64; i++) {
bit = ((uint64_t)data >> i) & 1;
// Track whether we've seen a 1 bit.
seen |= bit;
// If the bit is 0 and we haven't seen a 1 yet, increment b.
b += (1 - bit) & (seen - 1);
}
// If we saw a 1, return b + 1, else 0.
return (~(seen - 1)) & (b + 1);
}
/*!
@function cc_ffs32
@abstract Find first bit set in a 32-bit value
@param data A 32-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs32(int32_t data)
{
cc_static_assert(sizeof(int) == 4, "ffs relies on an int being 4 bytes");
#ifdef _WIN32
return cc_ffs32_fallback(data);
#else
return (unsigned)__builtin_ffs(data);
#endif
}
/*!
@function cc_ffs64
@abstract Find first bit set in a 64-bit value
@param data A 64-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs64(int64_t data)
{
#ifdef _WIN32
return cc_ffs64_fallback(data);
#else
return (unsigned)__builtin_ffsll(data);
#endif
}
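// Illustrative values (not part of the original header) for the bit-scan
// helpers above; the function name is hypothetical.
CC_UNUSED static bool cc_bitscan_example(void)
{
    // clz/ctz require nonzero inputs; ffs returns 0 only for a zero input.
    return cc_clz32(1) == 31 && cc_ctz32(8) == 3 &&
           cc_ffs32(8) == 4 && cc_ffs32(0) == 0;
}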
#define cc_add_overflow __builtin_add_overflow
#define cc_mul_overflow __builtin_mul_overflow
/* HEAVISIDE_STEP (shifted by one)
 function f(x): x -> 0, when x = 0
 x -> 1, when x > 0
 Can also be seen as a bitwise operation:
 f(x): x -> y
 y[0] = (OR of x[i]) for all i (all bits)
 y[i] = 0 for all i > 0
 Runs in constant time (log2(<bitsize of x>))
 Useful for constant-time checks
*/
#define CC_HEAVISIDE_STEP(r, s) { \
const uint64_t _s = (uint64_t)s; \
const uint64_t _t = (_s & 0xffffffff) | (_s >> 32); \
r = (__typeof__(r))((0xffffffff + _t) >> 32); \
}
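// Illustrative usage sketch (not part of the original header): map any nonzero
// value to 1 and zero to 0 without branching. The function name is hypothetical.
CC_UNUSED static unsigned cc_heaviside_example(uint64_t x)
{
    unsigned r;
    CC_HEAVISIDE_STEP(r, x); // r == 0 when x == 0, otherwise r == 1
    return r;
}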
/* Return 1 if x mod 4 is 1, 2, or 3; 0 otherwise */
#define CC_CARRY_2BITS(x) (((x>>1) | x) & 0x1)
#define CC_CARRY_3BITS(x) (((x>>2) | (x>>1) | x) & 0x1)
#define cc_ceiling(a,b) (((a)+((b)-1))/(b))
#define CC_BITLEN_TO_BYTELEN(x) cc_ceiling((x), 8)
/*!
@brief cc_muxp(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time
@param a input pointer
@param b input pointer
 @param s The selection parameter; s must be 0 or 1. If s is 1, a is returned. If s is 0, b is returned. Otherwise, the output is undefined.
 @return Returns a if s is 1, and b if s is 0
*/
void *cc_muxp(int s, const void *a, const void *b);
/*!
@brief CC_MUXU(r, s, a, b) is equivalent to r = s ? a : b, but executes in constant time
@param a Input a
@param b Input b
@param s Selection parameter s. Must be 0 or 1.
@param r Output, set to a if s=1, or b if s=0.
*/
#define CC_MUXU(r, s, a, b) \
{ \
__typeof__(r) _cond = (__typeof__(r))((s)-1); \
r = (~_cond & (a)) | (_cond & (b)); \
}
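// Illustrative usage sketch (not part of the original header): constant-time
// selection between two words; s must be exactly 0 or 1. The function name is
// hypothetical.
CC_UNUSED static uint32_t cc_muxu_example(int s, uint32_t a, uint32_t b)
{
    uint32_t r;
    CC_MUXU(r, s, a, b); // r == a when s == 1, r == b when s == 0
    return r;
}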
#define CC_PROVIDES_ABORT (!(CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKITROM))
/*!
@function cc_abort
@abstract Abort execution unconditionally
*/
CC_NORETURN
void cc_abort(const char *msg);
/*!
@function cc_try_abort
@abstract Abort execution iff the platform provides a function like @p abort() or @p panic()
@discussion If the platform does not provide a means to abort execution, this function does nothing; therefore, callers should return an error code after calling this function.
*/
#if CC_PROVIDES_ABORT
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-noreturn"
CC_INLINE
void cc_try_abort(const char *msg)
{
cc_abort(msg);
}
#pragma clang diagnostic pop
#else
CC_INLINE
void cc_try_abort(CC_UNUSED const char *msg)
{
}
#endif
#if __has_builtin(__builtin_expect)
#define CC_UNLIKELY(cond) __builtin_expect(cond, 0)
#else
#define CC_UNLIKELY(cond) cond
#endif
CC_INLINE
void cc_try_abort_if(bool condition, const char *msg)
{
if (CC_UNLIKELY(condition)) {
cc_try_abort(msg);
}
}
/*
Unfortunately, since we export this symbol, this declaration needs
to be in a public header to satisfy TAPI.
See fipspost_trace_priv.h for more details.
*/
extern const void *fipspost_trace_vtable;
#endif /* _CORECRYPTO_CC_PRIV_H_ */

View File

@ -0,0 +1,90 @@
/* Copyright (c) (2012,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef CORECRYPTO_CC_RUNTIME_CONFIG_H_
#define CORECRYPTO_CC_RUNTIME_CONFIG_H_
#include <corecrypto/cc_config.h>
#if defined(__x86_64__) || defined(__i386__)
#if CC_KERNEL
#include <i386/cpuid.h>
#define CC_HAS_RDRAND() ((cpuid_features() & CPUID_FEATURE_RDRAND) != 0)
#define CC_HAS_AESNI() ((cpuid_features() & CPUID_FEATURE_AES) != 0)
#define CC_HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
#define CC_HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
#define CC_HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
#define CC_HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
#define CC_HAS_BMI2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_BMI2) != 0)
#define CC_HAS_ADX() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_ADX) != 0)
#elif CC_DARWIN && CC_INTERNAL_SDK
#include <System/i386/cpu_capabilities.h>
#define CC_HAS_RDRAND() (_get_cpu_capabilities() & kHasRDRAND)
#define CC_HAS_AESNI() (_get_cpu_capabilities() & kHasAES)
#define CC_HAS_SupplementalSSE3() (_get_cpu_capabilities() & kHasSupplementalSSE3)
#define CC_HAS_AVX1() (_get_cpu_capabilities() & kHasAVX1_0)
#define CC_HAS_AVX2() (_get_cpu_capabilities() & kHasAVX2_0)
#define CC_HAS_AVX512_AND_IN_KERNEL() 0
#define CC_HAS_BMI2() (_get_cpu_capabilities() & kHasBMI2)
#define CC_HAS_ADX() (_get_cpu_capabilities() & kHasADX)
#else
#define CC_HAS_AESNI() __builtin_cpu_supports("aes")
#define CC_HAS_SupplementalSSE3() __builtin_cpu_supports("ssse3")
#define CC_HAS_AVX1() __builtin_cpu_supports("avx")
#define CC_HAS_AVX2() __builtin_cpu_supports("avx2")
#define CC_HAS_AVX512_AND_IN_KERNEL() 0
#define CC_HAS_BMI2() __builtin_cpu_supports("bmi2")
#if CC_LINUX || !CC_INTERNAL_SDK
#include <cpuid.h>
#include <stdbool.h>
CC_INLINE bool _cpu_supports_rdrand()
{
unsigned int eax, ebx, ecx, edx;
__cpuid(1, eax, ebx, ecx, edx);
return ecx & bit_RDRND;
}
CC_INLINE bool _cpu_supports_adx()
{
unsigned int eax, ebx, ecx, edx;
__cpuid_count(7, 0, eax, ebx, ecx, edx);
return ebx & bit_ADX;
}
#define CC_HAS_RDRAND() _cpu_supports_rdrand()
#define CC_HAS_ADX() _cpu_supports_adx()
#else
#define CC_HAS_RDRAND() 0
#define CC_HAS_ADX() 0
#endif
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#if defined(__arm64__)
#if CC_DARWIN && CC_INTERNAL_SDK
#include <System/arm/cpu_capabilities.h>
#define CC_HAS_SHA512() (_get_cpu_capabilities() & kHasARMv82SHA512)
#define CC_HAS_SHA3() (_get_cpu_capabilities() & kHasARMv82SHA3)
#else
#define CC_HAS_SHA512() (0)
#define CC_HAS_SHA3() (0)
#endif
#endif // defined(__arm64__)
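// Illustrative usage sketch (not part of the original header): these
// predicates are checked at run time to select an optimized implementation;
// the function names below are hypothetical.
//
//   #if defined(__x86_64__) || defined(__i386__)
//   if (CC_HAS_AESNI()) { aes_with_aesni(); } else { aes_generic(); }
//   #endif
//   #if defined(__arm64__)
//   if (CC_HAS_SHA512()) { sha512_with_arm_intrinsics(); } else { sha512_generic(); }
//   #endif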
#endif /* CORECRYPTO_CC_RUNTIME_CONFIG_H_ */

View File

@ -0,0 +1,174 @@
/* Copyright (c) (2013,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef corecrypto_arm_aes_compatability_h
#define corecrypto_arm_aes_compatability_h
// #include <Availability.h>
#include <sys/cdefs.h>
#if defined(__clang__) && ((defined(__apple_build_version__) && __apple_build_version__ > 5010000))
#define __USES_V_CRYPTO_INTRINSICS 1
#else
#define __USES_V_CRYPTO_INTRINSICS 0
#endif
// AES INSTRUCTIONS
// aese.16b v0, v1
// aesd.16b v0, v1
// aesmc.16b v0, v1
// aesimc.16b v0, v1
// SHA1 INTRINSICS
// sha1su0.4s v0, v1, v2
// sha1su1.4s v0, v1
// sha1c.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1m.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1p.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1h.4s v0, v1 // or s0, s1
// SHA256 INTRINSICS
// sha256su0.4s v0, v1
// sha256su1.4s v0, v1, v2
// sha256h.4s v0, v1, v2 // or q0, q1, v2.4s
// sha256h2.4s v0, v1, v2 // or q0, q1, v2.4s
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro AESE
aese.16b v$0, v$1
.endm
.macro AESD
aesd.16b v$0, v$1
.endm
.macro AESMC
aesmc.16b v$0, v$1
.endm
.macro AESIMC
aesimc.16b v$0, v$1
.endm
#else
.macro AESE
aese q$0, q$1
.endm
.macro AESD
aesd q$0, q$1
.endm
.macro AESMC
aesmc q$0, q$1
.endm
.macro AESIMC
aesimc q$0, q$1
.endm
#endif
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro SHA1SU0
sha1su0 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA1SU1
sha1su1 v$0.4s, v$1.4s
.endm
.macro SHA1C
sha1c q$0, s$1, v$2.4s
.endm
.macro SHA1M
sha1m q$0, s$1, v$2.4s
.endm
.macro SHA1P
sha1p q$0, s$1, v$2.4s
.endm
.macro SHA1H
sha1h s$0, s$1
.endm
.macro SHA256SU0
sha256su0 v$0.4s, v$1.4s
.endm
.macro SHA256SU1
sha256su1 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA256H
sha256h q$0, q$1, v$2.4s
.endm
.macro SHA256H2
sha256h2 q$0, q$1, v$2.4s
.endm
#else
.macro SHA1SU0
sha1su0 q$0, q$1, q$2
.endm
.macro SHA1SU1
sha1su1 q$0, q$1
.endm
.macro SHA1C
sha1c q$0, q$1, q$2
.endm
.macro SHA1M
sha1m q$0, q$1, q$2
.endm
.macro SHA1P
sha1p q$0, q$1, q$2
.endm
.macro SHA1H
sha1h q$0, q$1
.endm
.macro SHA256SU0
sha256su0 q$0, q$1
.endm
.macro SHA256SU1
sha256su1 q$0, q$1, q$2
.endm
.macro SHA256H
sha256h q$0, q$1, q$2
.endm
.macro SHA256H2
sha256h2 q$0, q$1, q$2
.endm
#endif
#endif /*corecrypto_arm_aes_compatability_h*/

View File

@ -0,0 +1,46 @@
/* Copyright (c) (2011,2015,2016,2018-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_
#define _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_
/*
* This file defines commonly used macros in handwritten assembly
* for making functions BTI and PAC compatible.
*/
#ifndef __arm64e__
#define __arm64e__ 0
#endif
.macro SIGN_LR
#if __arm64e__
pacibsp
#endif
.endmacro
.macro AUTH_LR_AND_RET
#if __arm64e__
retab
#else
ret
#endif
.endmacro
.macro BRANCH_TARGET_CALL
#if __arm64e__
hint #34 /* bti c */
#endif
.endmacro
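/*
 * Illustrative usage (not part of the original header): a handwritten arm64
 * routine would typically place BRANCH_TARGET_CALL as its first instruction,
 * SIGN_LR before the link register is spilled, and AUTH_LR_AND_RET in place of
 * a plain ret. The symbol name below is hypothetical.
 *
 *     .globl _example_fn
 *     _example_fn:
 *         BRANCH_TARGET_CALL
 *         SIGN_LR
 *         // ... function body ...
 *         AUTH_LR_AND_RET
 */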
#endif /* _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_ */

View File

@ -0,0 +1,596 @@
/* Copyright (c) (2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include "../corecrypto_test/include/testmore.h"
#include "testbyteBuffer.h"
#include <stdbool.h>
#include <limits.h>
#define CC_SECURITY_TEST
#if (CC == 0)
entryPoint(cc_tests,"cc")
#else
#ifdef CC_SECURITY_TEST
#include <corecrypto/ccrng_test.h>
#include "cccycles.h"
#include "ccstats.h"
#include "ccconstanttime.h"
#endif
// Disable the static analyzer for the code below since we do voluntary access to
// uninitialized memory area in stack
#ifdef __clang_analyzer__
int stack_clear_test(size_t size);
#endif
#ifndef __clang_analyzer__
#if defined(__has_feature) && __has_feature(address_sanitizer)
#define CC_NO_SANITIZE __attribute__((no_sanitize_address))
#else
#define CC_NO_SANITIZE
#endif // __has_feature
#define STACK_MAGIC 0xC0DEBA5E
CC_NO_SANITIZE static void
stack_dirty(size_t size)
{
volatile uint32_t array[size];
for (size_t i=0;i<size;i++)
{
array[i]=STACK_MAGIC;
}
}
CC_NO_SANITIZE static void
stack_clear(size_t size)
{
uint32_t array[size];
cc_clear(sizeof(array),array);
}
CC_NO_SANITIZE static int
stack_test(size_t size)
{
volatile uint32_t array[size];
for (size_t i=0;i<size;i++)
{
if (array[i]==STACK_MAGIC)
{
return 1; //error stack was not cleared.
}
}
return 0;
}
CC_NO_SANITIZE static int
stack_clear_test(size_t size)
{
stack_dirty(size);
stack_clear(size);
return stack_test(size);
}
#endif /* __clang_analyzer__ */
// Static analyzer re-enabled.
#define CLZ_RANDOM_TESTS 10000
static void
clz_tests(void) {
int i;
uint64_t r64;
uint32_t r32;
struct ccrng_state *rng = global_test_rng;
is(cc_clz32_fallback(2863311530), cc_clz32(2863311530), "clz32 1010... pattern");
is(cc_clz64_fallback(12297829382473034410U), cc_clz64(12297829382473034410U), "clz64 1010... pattern");
is(cc_clz32_fallback(1431655765), cc_clz32(1431655765), "clz32 0101... pattern");
is(cc_clz64_fallback(6148914691236517205U), cc_clz64(6148914691236517205U), "clz64 0101... pattern");
for (i = 0; i < 32; i++) {
is(cc_clz32_fallback(1U << i), cc_clz32(1U << i), "clz32");
is(cc_clz32_fallback((1U << i) + 1), cc_clz32((1U << i) + 1), "clz32 + 1");
is(cc_clz32_fallback((1U << i) + (1U << 16)), cc_clz32((1U << i) + (1U << 16)), "clz32 + 1 << 16");
}
for (i = 0; i < 64; i++) {
is(cc_clz64_fallback(1ULL << i), cc_clz64(1ULL << i), "clz64");
is(cc_clz64_fallback((1ULL << i) + 1), cc_clz64((1ULL << i) + 1), "clz64 + 1");
is(cc_clz64_fallback((1ULL << i) + UINT_MAX + 1), cc_clz64((1ULL << i) + UINT_MAX + 1), "clz64 + 1 << 32");
}
for (i = 0; i < CLZ_RANDOM_TESTS; i++)
{
ccrng_generate(rng, sizeof(r64), &r64);
is(cc_clz64_fallback(r64), cc_clz64(r64), "clz64 random");
r32 = r64 >> 32;
is(cc_clz32_fallback(r32), cc_clz32(r32), "clz32 random");
}
}
#define CTZ_RANDOM_TESTS 10000
static void
ctz_tests(void) {
int i;
uint64_t r64;
uint32_t r32;
struct ccrng_state *rng = global_test_rng;
is(cc_ctz32_fallback(2863311530), cc_ctz32(2863311530), "ctz32 1010... pattern");
is(cc_ctz64_fallback(12297829382473034410U), cc_ctz64(12297829382473034410U), "ctz64 1010... pattern");
is(cc_ctz32_fallback(1431655765), cc_ctz32(1431655765), "ctz32 0101... pattern");
is(cc_ctz64_fallback(6148914691236517205U), cc_ctz64(6148914691236517205U), "ctz64 0101... pattern");
for (i = 0; i < 32; i++) {
is(cc_ctz32_fallback(1U << i), cc_ctz32(1U << i), "ctz32");
is(cc_ctz32_fallback((1U << i) + 1), cc_ctz32((1U << i) + 1), "ctz32 + 1");
is(cc_ctz32_fallback((1U << i) + (1U << 16)), cc_ctz32((1U << i) + (1U << 16)), "ctz32 + 1 << 16");
}
for (i = 0; i < 64; i++) {
is(cc_ctz64_fallback(1ULL << i), cc_ctz64(1ULL << i), "ctz64");
is(cc_ctz64_fallback((1ULL << i) + 1), cc_ctz64((1ULL << i) + 1), "ctz64 + 1");
is(cc_ctz64_fallback((1ULL << i) + UINT_MAX + 1), cc_ctz64((1ULL << i) + UINT_MAX + 1), "ctz64 + 1 << 32");
}
for (i = 0; i < CTZ_RANDOM_TESTS; i++)
{
ccrng_generate(rng, sizeof(r64), &r64);
is(cc_ctz64_fallback(r64), cc_ctz64(r64), "ctz64 random");
r32 = r64 >> 32;
is(cc_ctz32_fallback(r32), cc_ctz32(r32), "ctz32 random");
}
}
#define FFS_RANDOM_TESTS 10000
static void
ffs_tests(void) {
int i;
int64_t r64;
int32_t r32;
struct ccrng_state *rng = global_test_rng;
is(cc_ffs32_fallback(0), cc_ffs32(0), "ffs32 zero");
is(cc_ffs64_fallback(0), cc_ffs64(0), "ffs64 zero");
is(cc_ffs32_fallback((int32_t)2863311530), cc_ffs32((int32_t)2863311530), "ffs32 1010... pattern");
is(cc_ffs64_fallback((int64_t)12297829382473034410U), cc_ffs64((int64_t)12297829382473034410U), "ffs64 1010... pattern");
is(cc_ffs32_fallback(1431655765), cc_ffs32(1431655765), "ffs32 0101... pattern");
is(cc_ffs64_fallback(6148914691236517205), cc_ffs64(6148914691236517205), "ffs64 0101... pattern");
for (i = 0; i < 32; i++) {
is(cc_ffs32_fallback(1 << i), cc_ffs32(1 << i), "ffs32");
is(cc_ffs32_fallback((1 << i) + 1), cc_ffs32((1 << i) + 1), "ffs32 + 1");
is(cc_ffs32_fallback((1 << i) + (1 << 16)), cc_ffs32((1 << i) + (1 << 16)), "ffs32 + 1 << 16");
}
for (i = 0; i < 64; i++) {
is(cc_ffs64_fallback(1LL << i), cc_ffs64(1LL << i), "ffs64");
is(cc_ffs64_fallback((1LL << i) + 1), cc_ffs64((1LL << i) + 1), "ffs64 + 1");
is(cc_ffs64_fallback((1LL << i) + UINT_MAX + 1), cc_ffs64((1LL << i) + UINT_MAX + 1), "ffs64 + 1 << 32");
}
for (i = 0; i < FFS_RANDOM_TESTS; i++) {
ccrng_generate(rng, sizeof(r64), &r64);
is(cc_ffs64_fallback(r64), cc_ffs64(r64), "ffs64 random");
r32 = r64 >> 32;
is(cc_ffs32_fallback(r32), cc_ffs32(r32), "ffs32 random");
}
}
static void
Rotate_Tests(void) {
int c=1;
uint32_t result32=0xaaaaaaaa;
uint64_t result64=0xaaaaaaaaaaaaaaaa;
/* The first argument is NOT a variable on purpose */
is(result32, CC_ROL(0x55555555, c), "CC_ROL 1");
is(result32, CC_ROLc(0x55555555, 1), "CC_ROLc 1");
is(result64, CC_ROL64(0x5555555555555555, c), "CC_ROL64 1");
is(result64, CC_ROL64c(0x5555555555555555, 1), "CC_ROL64c 1");
is(result32, CC_ROR(0x55555555, c), "CC_ROR 1");
is(result32, CC_RORc(0x55555555, 1), "CC_RORc 1");
is(result64, CC_ROR64(0x5555555555555555, c), "CC_ROR64 1");
is(result64, CC_ROR64c(0x5555555555555555, 1), "CC_ROR64c 1");
}
static void
mux_Tests(void) {
uint8_t i8;
uint16_t i16;
uint32_t i32;
uint64_t i64;
CC_MUXU(i8,0,(uint8_t)0xAB,(uint8_t)0xBA);
is(i8,0xBA,"sizeof(uint8_t)!=1");
CC_MUXU(i8,1,(uint8_t)0xBA,(uint8_t)0xAB);
is(i8,0xBA,"sizeof(uint8_t)!=1");
CC_MUXU(i16,0,(uint16_t)0xAB00,(uint16_t)0xBA00);
is(i16,0xBA00,"sizeof(uint8_t)!=1");
CC_MUXU(i16,1,(uint16_t)0xBA00,(uint16_t)0xAB00);
is(i16,0xBA00,"sizeof(uint8_t)!=1");
CC_MUXU(i32,0,(uint32_t)0xAB00BEEF,(uint32_t)0xBA00BEEF);
is(i32,0xBA00BEEF,"sizeof(uint8_t)!=1");
CC_MUXU(i32,1,(uint32_t)0xBA00BEEF,(uint32_t)0xAB00BEEF);
is(i32,0xBA00BEEF,"sizeof(uint8_t)!=1");
CC_MUXU(i64,0,(uint64_t)0xAB00BEEF11223344,(uint64_t)0xBA00BEEF11223344);
is(i64,0xBA00BEEF11223344,"sizeof(uint8_t)!=1");
CC_MUXU(i32,1,(uint64_t)0xBA00BEEF11223344,(uint64_t)0xAB00BEEF11223344);
is(i64,0xBA00BEEF11223344,"sizeof(uint8_t)!=1");
}
static void
HEAVISIDE_STEP_Tests(void)
{
uint8_t i8;
uint16_t i16;
uint32_t i32;
uint64_t i64;
size_t i; // loop index
uint8_t err=0,nb_test=0;
// Sanity check on intended lengths
ok(sizeof(uint8_t) == 1, "sizeof(uint8_t)!=1");
ok(sizeof(uint16_t) == 2, "sizeof(uint16_t)!=2");
ok(sizeof(uint32_t) == 4, "sizeof(uint32_t)!=4");
ok(sizeof(uint64_t) == 8, "sizeof(uint64_t)!=8");
for (i=0;i<8*sizeof(i8);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i8,((uint8_t)1<<i));
if (i8!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i8)");
for (i=0;i<8*sizeof(i16);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i16,((uint16_t)1<<i));
if (i16!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i16)");
for (i=0;i<8*sizeof(i32);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i32,((uint32_t)1<<i));
if (i32!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i32)");
for (i=0;i<8*sizeof(i64);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i64,((uint64_t)1<<i));
if (i64!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i64)");
ok(err + (64+32+16+8)-nb_test==0, "CC HEAVISIDE_STEP test failed");
}
static void
cmp_secure_functionalTests(void) {
#define ARRAY_SIZE 10
// --- Bytes
uint8_t array1[ARRAY_SIZE]={1,2,3,4,5,6,7,8,9,0};
uint8_t array2[ARRAY_SIZE];
memcpy(array2,array1,sizeof(array1));
// Equal
ok(cc_cmp_safe(sizeof(array1), array1,array2)==0, "array1 to array2");
ok(cc_cmp_safe(sizeof(array1), array2,array1)==0, "array2 to array1");
// length is zero
ok(cc_cmp_safe(0, array2,array1)!=0, "Array of size 0");
// Equal but first byte
array1[0]++;
ok(cc_cmp_safe(sizeof(array1), array1,array2)!=0, "first byte");
array1[0]--;
// Equal but last byte
array1[sizeof(array1)-1]++;
ok(cc_cmp_safe(sizeof(array1), array1,array2)!=0, "last byte");
array1[sizeof(array1)-1]--;
// --- cc_units
uint64_t u64_array1[ARRAY_SIZE]={};
for (size_t i=0;i<ARRAY_SIZE;i++) u64_array1[i]=i;
uint64_t u64_array2[ARRAY_SIZE];
uint64_t tmp;
memcpy(u64_array2,u64_array1,sizeof(u64_array1));
// Equal
ok(cc_cmp_safe(sizeof(u64_array1), u64_array1,u64_array2)==0, "array1 to array2");
ok(cc_cmp_safe(sizeof(u64_array1), u64_array2,u64_array1)==0, "array2 to array1");
// length is zero
ok(cc_cmp_safe(0, u64_array2,u64_array1)!=0, "Array of size 0");
// Equal but first byte
((uint8_t *)u64_array1)[0]++;
ok(cc_cmp_safe(sizeof(u64_array1),u64_array1,u64_array2)!=0, "first byte");
((uint8_t *)u64_array1)[0]--;
// Equal but last byte
CC_LOAD64_BE(tmp,&u64_array1[ARRAY_SIZE-1]);
CC_STORE64_BE(tmp^0x80,&u64_array1[ARRAY_SIZE-1]);
ok(cc_cmp_safe(sizeof(u64_array1), u64_array1,u64_array2)!=0, "last byte");
CC_STORE64_BE(tmp,&u64_array1[ARRAY_SIZE-1]);
}
#ifdef CC_SECURITY_TEST
//======================================================================
// Constant time verification parameters
//======================================================================
// Number of warm-up iterations whose timings are not taken into account,
// used to reach a stable performance state.
#define CC_WARMUP 10
// Each sample is the average time over many iterations with identical inputs.
#define CC_TIMING_REPEAT 150
// Number of samples for the statistical analysis;
// typically 100~1000 is a good range.
#define CC_TIMING_SAMPLES 200
// In case of failure, retry several times.
// This reduces false positives due to noise and limited timing accuracy.
// If the implementation is not constant time, its behavior is consistent,
// so retrying does not reduce the detection power.
#define CC_TIMING_RETRIES 10
// Two statistical tools are available: the T-test and the Wilcoxon rank-sum test.
// The T-test assumes that the distributions being compared are normal;
// Wilcoxon measures the offset between distributions.
// Due to potential switches between performance states or occasional
// latencies, Wilcoxon is recommended.
// > Set to 1 to use the T-test instead of Wilcoxon
#define T_TEST 1
// Number of iterations of the full test (tweak to evaluate the chance of false positives)
#define CMP_SECURITY_TEST_ITERATION 1
// Quantile for the repeated timing. Empirical value.
#define CC_TIMING_PERCENTILE 9
//======================================================================
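// For illustration only: the statistical back ends used below,
// T_test_isRejected() and WilcoxonRankSumTest(), live in ccstats.h and are not
// shown in this file. A two-sample Welch t statistic over the two alternating
// timing groups could be computed roughly as sketched here; the 4.5 threshold
// is the conventional TVLA cut-off and is an assumption, not necessarily the
// value corecrypto uses. Assumes measurement_t.timing is a numeric count.
#if 0
#include <math.h>
static int example_t_test_rejects(const measurement_t *samples, size_t n)
{
    double sum[2] = {0, 0}, sumsq[2] = {0, 0};
    size_t cnt[2] = {0, 0};
    for (size_t i = 0; i < n; i++) {
        int g = samples[i].group & 1;            // alternating measurement groups
        double x = (double)samples[i].timing;
        sum[g] += x;
        sumsq[g] += x * x;
        cnt[g]++;
    }
    if (cnt[0] < 2 || cnt[1] < 2) return 0;
    double m0 = sum[0] / cnt[0], m1 = sum[1] / cnt[1];
    double v0 = (sumsq[0] - cnt[0] * m0 * m0) / (cnt[0] - 1); // unbiased sample variances
    double v1 = (sumsq[1] - cnt[1] * m1 * m1) / (cnt[1] - 1);
    double t = (m0 - m1) / sqrt(v0 / cnt[0] + v1 / cnt[1]);   // Welch's t statistic
    return fabs(t) > 4.5; // reject "same timing distribution" => possible leak
}
#endif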
static const int verbose=1;
#define TEST_LAST_BYTE 1
#define TEST_FIRST_BYTE 2
#define TEST_RANDOM 3
#define TEST_EQUAL 4
static int
cmp_secure_timeconstantTests(size_t length, struct ccrng_state *rng, uint32_t test_id) {
// Random for messages
uint8_t array1[length];
uint8_t array2[length];
int failure_cnt=0;
int early_abort=1;
uint32_t j,sample_counter;
bool retry=true;
if (length<=0) {goto errOut;}
j=0;
while(retry)
{
sample_counter=0; // Index of current sample
measurement_t timing_sample[2*CC_TIMING_SAMPLES];
for (size_t i=0;i<2*CC_TIMING_SAMPLES+(CC_WARMUP/CC_TIMING_REPEAT);i++)
{
ccrng_generate(rng,length,array1);
volatile int cmp_result;
if ((i&1) == 0)
{
// -------------------------
// Random
// -------------------------
switch(test_id) {
// All equal, except last byte
case TEST_LAST_BYTE:
memcpy(array2,array1,length);
array2[length-1]^=1;
break;
// All equal, except first byte
case TEST_FIRST_BYTE:
memcpy(array2,array1,length);
array2[0]^=1;
break;
// Random
case TEST_RANDOM:
ccrng_generate(rng,length,array2);
break;
// All equal
case TEST_EQUAL:
memcpy(array2,array1,length);
break;
default:
return 0; // failure
}
}
else
{
// -------------------------
// Equal
// -------------------------
memcpy(array2,array1,length);
}
#if 1
// Actual function to test
TIMING_WITH_QUANTILE(timing_sample[sample_counter].timing,
CC_TIMING_REPEAT,
CC_TIMING_PERCENTILE,
cmp_result=cc_cmp_safe(length, array1, array2),errOut);
#else
// Reference which can be expected to fail
TIMING_WITH_QUANTILE(timing_sample[sample_counter].timing,
CC_TIMING_REPEAT,
CC_TIMING_PERCENTILE,
cmp_result=memcmp(array1, array2,length),errOut);
#endif
timing_sample[sample_counter].group=sample_counter&1;
#if CC_WARMUP
if (i>=CC_WARMUP/CC_TIMING_REPEAT)
#endif
{
sample_counter++;
}
}
#if CCN_OSX
if (verbose>1) {
char file_name[64];
snprintf(file_name,sizeof(file_name),"corecrypto_test_cc_cmp_timings_%.2zu.csv",length);
export_measurement_to_file(file_name,timing_sample,sample_counter);
}
#endif
// Process results
#if T_TEST
// T test
int status=T_test_isRejected(timing_sample,sample_counter);
#else
// Wilcoxon Rank-Sum Test
int status=WilcoxonRankSumTest(timing_sample,sample_counter);
#endif
if (status!=0)
{
j++; // retry counter
if (j>=CC_TIMING_RETRIES)
{
diag("Constant timing FAILED for len %d after %d attempts",length,j);
//ok_or_fail((status==0),"Decrypt+padding constant timing");
failure_cnt++;
break;
}
}
else
{
if ((verbose>1) && (j>0)) diag("Constant timing ok for len %zu after %u attempts (of %d)",length,j+1,CC_TIMING_RETRIES);
break;
}
} // retry
early_abort=0;
errOut:
if (failure_cnt || early_abort)
{
return 0;
}
return 1;
}
#define CMP_SECURITY_TEST_MAX_LENGTH 2048
static void
memcmp_secure_securityTests(void) {
// Random for messages
struct ccrng_state *rng = global_test_rng;
for (size_t i=0;i<CMP_SECURITY_TEST_ITERATION;i++)
{
size_t r;
ccrng_generate(rng,sizeof(r),&r);
r=(r%CMP_SECURITY_TEST_MAX_LENGTH)+1;
ok(cmp_secure_timeconstantTests(r,rng,TEST_FIRST_BYTE), "Time constant check, first byte difference");
ok(cmp_secure_timeconstantTests(r,rng,TEST_LAST_BYTE), "Time constant check, last byte difference");
ok(cmp_secure_timeconstantTests(r,rng,TEST_RANDOM), "Time constant check, random");
ok(cmp_secure_timeconstantTests(r,rng,TEST_EQUAL), "Time constant check of equal input - if it fails, it's a test issue");
}
}
#endif // CC_SECURITY_TEST
#ifdef CC_SECURITY_TEST
#define kPlan_ccSecurityTestNb 5
#else
#define kPlan_ccSecurityTestNb 0
#endif
int cc_tests(TM_UNUSED int argc, TM_UNUSED char *const *argv)
{
int num_tests = 36 + kPlan_ccSecurityTestNb;
num_tests += 292 + 2 * CLZ_RANDOM_TESTS; // clz_tests
num_tests += 292 + 2 * CTZ_RANDOM_TESTS; // ctz_tests
num_tests += 294 + 2 * FFS_RANDOM_TESTS; // ffs_tests
plan_tests(num_tests);
clz_tests();
ctz_tests();
ffs_tests();
//For Windows port, many unsigned longs have been replaced with size_t.
//This test makes sure corecrypto is agnostic to the change.
//This test can be removed later on.
#if defined(_WIN64) && defined(_WIN32)
ok(sizeof(size_t)!=sizeof(unsigned long),
#else
ok(sizeof(size_t)==sizeof(unsigned long),
#endif
"Historically, corecrypto assumes size_t and long have the same size. Fon Win64, that is not the case");
if(verbose) diag("Stack cleanup");
ok(stack_clear_test(100)==0, "Stack clearing");
if(verbose) diag("mux test");
mux_Tests();
if(verbose) diag("HEAVISIDE_STEP test");
HEAVISIDE_STEP_Tests();
if(verbose) diag("Rotate test");
Rotate_Tests();
if(verbose) diag("Secure comparison test");
cmp_secure_functionalTests();
#ifdef CC_SECURITY_TEST
if(verbose) diag("Secure comparison security test");
memcmp_secure_securityTests();
#endif // CC_SECURITY_TEST
// Silence code coverage
const char *label = "corecrypto";
const uint8_t *buffer = (const uint8_t *)label;
cc_print("label", strlen(label), buffer);
return 0;
}
#endif //CC

54
cc/src/cc_abort.c Normal file
View File

@ -0,0 +1,54 @@
/* Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
// cc_abort() is implemented to comply with FIPS 140-2, which requires aborting
// when the DRBG produces two equal consecutive blocks.
#if !CC_PROVIDES_ABORT
#error "This environment does not provide an abort()/panic()-like function"
#elif CC_KERNEL
#include <kern/debug.h>
void cc_abort(const char * msg)
{
panic("%s", msg);
}
#elif CC_USE_L4
#include <sys/panic.h>
#include <stdarg.h>
void cc_abort(const char * msg)
{
sys_panic(msg);
}
#elif CC_RTKIT
#include <RTK_platform.h>
void cc_abort(const char * msg)
{
RTK_abort("%s", msg);
}
#else
#include <stdlib.h>
void cc_abort(const char * msg CC_UNUSED)
{
abort();
}
#endif

18
cc/src/cc_atfork_child.c Normal file
View File

@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>
void cc_atfork_child(void)
{
ccrng_cryptographic_atfork_child();
}

18
cc/src/cc_atfork_parent.c Normal file
View File

@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>
void cc_atfork_parent(void)
{
ccrng_cryptographic_atfork_parent();
}

View File

@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>
void cc_atfork_prepare(void)
{
ccrng_cryptographic_atfork_prepare();
}

35
cc/src/cc_clear.c Normal file
View File

@ -0,0 +1,35 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc.h>
#include "corecrypto/fipspost_trace.h"
#if ( CC_HAS_MEMSET_S == 1 ) && (defined( __STDC_WANT_LIB_EXT1__ ) && ( __STDC_WANT_LIB_EXT1__ == 1 ) )
void cc_clear(size_t len, void *dst)
{
FIPSPOST_TRACE_EVENT;
memset_s(dst,len,0,len);
}
#elif defined(_WIN32) && !defined(__clang__) // Clang with Microsoft CodeGen doesn't support SecureZeroMemory
#include <windows.h>
void cc_clear(size_t len, void *dst)
{
SecureZeroMemory(dst, len);
}
#else
void cc_clear(size_t len, void *dst)
{
FIPSPOST_TRACE_EVENT;
volatile char *vptr = (volatile char *)dst;
while (len--)
*vptr++ = '\0';
}
#endif
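/*
 * Illustrative caller pattern (not part of this file): cc_clear() is intended
 * for wiping secrets where a plain memset() could be elided as a dead store.
 */
#if 0
{
    uint8_t key[32];
    /* ... derive and use the key ... */
    cc_clear(sizeof(key), key); /* wipe before the buffer goes out of scope */
}
#endif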

26
cc/src/cc_cmp_safe.c Normal file
View File

@ -0,0 +1,26 @@
/* Copyright (c) (2014,2015,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2)
{
size_t i;
const uint8_t *s=(const uint8_t *)ptr1;
const uint8_t *t=(const uint8_t *)ptr2;
uint8_t flag=((num<=0)?1:0); // If 0 return an error
for (i=0;i<num;i++)
{
flag|=(s[i]^t[i]);
}
CC_HEAVISIDE_STEP(flag,flag); // flag=(flag==0)?0:1;
return flag; // 0 iff all bytes were equal, 1 if there is any difference
}
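/*
 * Illustrative caller pattern (not part of this file): authentication-tag
 * comparison, where an early-exit memcmp() would leak how many leading bytes
 * matched. The function name below is hypothetical.
 */
#if 0
static bool tag_matches(size_t tag_len, const uint8_t *computed, const uint8_t *received)
{
    return cc_cmp_safe(tag_len, computed, received) == 0; /* 0 means equal */
}
#endif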

26
cc/src/cc_debug.c Normal file
View File

@ -0,0 +1,26 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include "cc_debug.h"
#include "cc_memory.h"
#if CORECRYPTO_DEBUG
struct ws_dbg g_ws_dbg;
#endif
void cc_print(const char *label, size_t count, const uint8_t *s) {
cc_printf("%s { %zu, ",label, count);
for (size_t ix=0; ix<count ; ix++) {
cc_printf("%.02x", s[ix]);
}
cc_printf(" }\n");
}

35
cc/src/cc_fault_canary.c Normal file
View File

@ -0,0 +1,35 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_fault_canary.h>
#include <corecrypto/cc_fault_canary_internal.h>
const cc_fault_canary_t CCEC_FAULT_CANARY = { 0xce, 0x3c, 0xed, 0x46, 0x6b, 0x11, 0xbf, 0x08, 0x13, 0xa0, 0xd4, 0xbf, 0x89, 0x60, 0xeb, 0x56 };
const cc_fault_canary_t CCRSA_PSS_FAULT_CANARY = { 0xef, 0x49, 0xba, 0x59, 0x22, 0xfe, 0x10, 0xdd, 0x84, 0x4f, 0x24, 0xd6, 0xad, 0xc0, 0xa9, 0x93 };
const cc_fault_canary_t CCRSA_PKCS1_FAULT_CANARY = { 0xea, 0xc5, 0x4a, 0x7c, 0x9f, 0x28, 0xdf, 0x10, 0xb6, 0xe9, 0x3e, 0xb9, 0x1c, 0xd3, 0x3a, 0xc5 };
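// How the scheme is intended to work: when in1 and in2 agree on every byte, all
// of the XOR'd pairs below vanish and fault_canary_out ends up equal to the
// known fault_canary constant; any mismatch (for example one induced by a
// glitched verification) almost certainly perturbs the output away from that
// constant, so the caller can compare fault_canary_out against the expected value.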
void cc_fault_canary_set(cc_fault_canary_t fault_canary_out, const cc_fault_canary_t fault_canary, size_t nbytes, const uint8_t *in1, const uint8_t *in2)
{
// We need to be careful with our XORs.
// The first loop XORs the actual fault canary value
for (size_t ci = 0; ci < CC_FAULT_CANARY_SIZE; ci++) {
size_t bi = ci % nbytes;
fault_canary_out[ci] = in1[bi] ^ in2[bi] ^ fault_canary[ci];
}
// The second loop XORs the existing value in the input fault canary buffer.
for (size_t i = CC_FAULT_CANARY_SIZE; i < nbytes; i++) {
size_t bi = i % nbytes;
size_t ci = i % sizeof(CCEC_FAULT_CANARY);
fault_canary_out[ci] = in1[bi] ^ in2[bi] ^ fault_canary_out[ci];
}
}

27
cc/src/cc_muxp.c Normal file
View File

@ -0,0 +1,27 @@
/* Copyright (c) (2015,2016,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
/*
Per C99 ISO/IEC 9899:1999 §6.5.8 and 6.5.9 Relational operator:
Each of the operators < , > , <= , >=, ==, != yield 1 if the specified relation is true and 0 if it is false. ... The result type is integer.
Also applies to other revisions of the C standard such as C11.
*/
// Returns z = s ? a : b in constant time, where a and b are pointers. s must be either 0 or 1.
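// Worked example: with s == 1, (uintptr_t)s - 1 == 0, so cond is all ones and
// rc == ia (returns a); with s == 0, s - 1 wraps to the all-ones value, so cond
// is 0 and rc == ib (returns b). No branch depends on s.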
void *cc_muxp(int s, const void *a, const void *b)
{
cc_assert(s==1 || s==0);
uintptr_t ia = (uintptr_t) a;
uintptr_t ib = (uintptr_t) b;
uintptr_t cond =~((uintptr_t)s-(uintptr_t)1);//s?~zero:zero; see above
uintptr_t rc = (cond&ia)|(~cond&ib);
return (void *)rc;
}

39
cc/src/cc_rdrand.c Normal file
View File

@ -0,0 +1,39 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_runtime_config.h>
#include "cc_internal.h"
#if defined(__x86_64__)
bool cc_rdrand(uint64_t *rand)
{
bool ok;
if (CC_HAS_RDRAND()) {
asm volatile ("rdrand %0; setc %1" : "=r"(rand), "=qm"(ok) : : "cc");
} else {
*rand = 0;
ok = false;
}
return ok;
}
#else
bool cc_rdrand(uint64_t *rand)
{
*rand = 0;
return false;
}
#endif
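/*
 * Illustrative caller pattern (not part of this file): RDRAND can transiently
 * fail even when supported, so callers commonly retry a bounded number of times.
 */
#if 0
{
    uint64_t r;
    int tries = 10;
    while (tries-- > 0 && !cc_rdrand(&r)) {
        /* retry */
    }
}
#endif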

View File

@ -0,0 +1,25 @@
/* Copyright (c) (2014,2015,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#import "cc_unit.h"
NSString *cc_composeString(NSString *format, ...) {
if (!format) return @"";
NSString *composedString;
va_list args;
va_start(args, format);
composedString = [[[NSString alloc] initWithFormat:format arguments:args] autorelease];
va_end(args);
return composedString;
}

21
cc/xcunit/cc_hex_string.m Normal file
View File

@ -0,0 +1,21 @@
/* Copyright (c) (2010,2014,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#import "cc_unit.h"
NSString *cc_hex_string(size_t len, const unsigned char *s) {
NSMutableString *r = [[NSMutableString alloc] initWithCapacity: 3 + len * 8];
for (size_t ix = 0; ix < len; ++ix) {
[r appendFormat: @"%.02x", s[ix]];
}
[r autorelease];
return r;
}

85
cc/xcunit/cc_unit.h Normal file
View File

@ -0,0 +1,85 @@
/* Copyright (c) (2014,2015,2016,2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#import <XCTest/XCTest.h>
#import <corecrypto/ccrng_test.h>
#import <corecrypto/ccrng_system.h>
NSString *cc_hex_string(size_t len, const unsigned char *s);
NSString *cc_composeString(NSString *format, ...);
#define XCAssertMemEquals(len, a1, a2, description, ...) \
({ \
@try { \
const void *_a1value = (a1); \
const void *_a2value = (a2); \
size_t _lenvalue = (len); \
if (memcmp(_a1value, _a2value, _lenvalue) != 0) {\
NSString *_expression = cc_composeString(description, ##__VA_ARGS__); \
NSString *_a1encoded = cc_hex_string(_lenvalue, _a1value); \
NSString *_a2encoded = cc_hex_string(_lenvalue, _a2value); \
XCTFail(@"%@\n%@\n should be \n%@",_expression, _a1encoded, _a2encoded);\
}\
}\
@catch (NSException *exception) {\
XCTFail(@"An exception caught");\
}\
})
#define XCAssertCharsEquals(len, a1, a2, description, ...) \
({ \
@try { \
const void *_a1value = (a1); \
const void *_a2value = (a2); \
size_t _lenvalue = (len); \
if (memcmp(_a1value, _a2value, _lenvalue) != 0) { \
NSString *_expression = cc_composeString(description, ##__VA_ARGS__); \
NSString *_a1encoded = cc_hex_string(_lenvalue, _a1value); \
NSString *_a2encoded = cc_hex_string(_lenvalue, _a2value); \
XCTFail(@"%@\n%@\n should be \n%@",_expression, _a1encoded, _a2encoded);\
} \
} \
@catch (NSException *exception) {\
XCTFail(@"An exception caught");\
}\
})
// When choosing the input seed, it must have the format "\x00\x01\x02\x03"...
#define XCTestRNG(rngname,input_seed) \
struct ccrng_test_state _test_rng; \
struct ccrng_state* rngname=(struct ccrng_state*)&_test_rng; \
size_t seedlen=sizeof(input_seed)-1; \
uint8_t random_seed[16]; \
uint8_t *seed=(uint8_t *)input_seed; \
if (input_seed==NULL || seedlen<=0) \
{\
seed=random_seed; \
seedlen=sizeof(random_seed); \
struct ccrng_system_state system_rng; \
XCTAssert(ccrng_system_init(&system_rng)==0); \
XCTAssert(ccrng_generate((struct ccrng_state *)&system_rng, seedlen, random_seed)==0); \
ccrng_system_done(&system_rng); \
} else {\
printf("Forced "); \
seed=(uint8_t *)input_seed; \
} \
XCTAssert(ccrng_test_init(&_test_rng, seedlen,seed,"")==0); \
NSString *_seed_encoded = cc_hex_string(seedlen, seed); \
printf("XCTestRNG seed: %s {", [_seed_encoded UTF8String]); \
for (size_t i=0;i<seedlen;i++) printf("\\x%02x",seed[i]); \
printf("}\n"); \
#define XCTestRNG_Done(rng) \
ccrng_test_done((struct ccrng_test_state*)rng); \
rng=NULL;

View File

@ -0,0 +1,74 @@
/* Copyright (c) (2012,2015,2016,2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_H_
#define _CORECRYPTO_FIPSPOST_H_
#include <stdint.h>
#include <corecrypto/cc_config.h>
// Boot-Arg fips_mode Flags
//
// FIPS_MODE_FLAG_FULL is the default value when no other value is set, which
// is the case for all production devices.
//
// When performing tests, if _FORCEFAIL is set to true, then the tests
// intentionally fail and log their failure. The kernelspace and userspace
// flags can be enabled independently.
//
// If it's not desired to panic, supply the _NOPANIC flag with the
// _FORCEFAIL flag.
//
// Additional logging can be enabled by supplying the _VERBOSE flag.
//
// _NOINTEG is used to ignore just the results of the module integrity
// check process, which is very useful when setting breakpoints in the
// kext for diagnostic or auditing purposes.
//
// Supplying _TRACE causes a trace buffer to be accumulated of the instrumented
// functions for only one execution of the POST. As the POST finishes, the
// _TRACE flag is cleared from the fips_mode and no further tracing will occur.
#define FIPS_MODE_FLAG_DEBUG (1 << 0)
#define FIPS_MODE_FLAG_FULL (1 << 1)
#define FIPS_MODE_FLAG_DISABLE (1 << 2)
#define FIPS_MODE_FLAG_VERBOSE (1 << 3)
#define FIPS_MODE_FLAG_US_FORCEFAIL (1 << 4)
#define FIPS_MODE_FLAG_KS_FORCEFAIL (1 << 5)
#define FIPS_MODE_FLAG_NOINTEG (1 << 6)
#define FIPS_MODE_FLAG_TRACE (1 << 7)
#define FIPS_MODE_FLAG_NOPANIC (1 << 8)
#define FIPS_MODE_IS_DEBUG(MODE) ((MODE) & FIPS_MODE_FLAG_DEBUG)
#define FIPS_MODE_IS_FULL(MODE) ((MODE) & FIPS_MODE_FLAG_FULL)
#define FIPS_MODE_IS_DISABLE(MODE) ((MODE) & FIPS_MODE_FLAG_DISABLE)
#define FIPS_MODE_IS_VERBOSE(MODE) ((MODE) & FIPS_MODE_FLAG_VERBOSE)
#define FIPS_MODE_IS_US_FORCEFAIL(MODE) ((MODE) & FIPS_MODE_FLAG_US_FORCEFAIL)
#define FIPS_MODE_IS_KS_FORCEFAIL(MODE) ((MODE) & FIPS_MODE_FLAG_KS_FORCEFAIL)
#define FIPS_MODE_IS_NOINTEG(MODE) ((MODE) & FIPS_MODE_FLAG_NOINTEG)
#define FIPS_MODE_IS_TRACE(MODE) ((MODE) & FIPS_MODE_FLAG_TRACE)
#define FIPS_MODE_IS_NOPANIC(MODE) ((MODE) & FIPS_MODE_FLAG_NOPANIC)
#if CC_KERNEL
#define FIPS_MODE_FLAG_FORCEFAIL FIPS_MODE_FLAG_KS_FORCEFAIL
#define FIPS_MODE_IS_FORCEFAIL(MODE) FIPS_MODE_IS_KS_FORCEFAIL(MODE)
#else
#define FIPS_MODE_FLAG_FORCEFAIL FIPS_MODE_FLAG_US_FORCEFAIL
#define FIPS_MODE_IS_FORCEFAIL(MODE) FIPS_MODE_IS_US_FORCEFAIL(MODE)
#endif
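/*
 * Illustrative only: a diagnostic configuration that forces a logged,
 * non-panicking POST failure with extra output could combine the flags roughly
 * like this. The variable name is hypothetical.
 */
#if 0
uint32_t example_fips_mode = FIPS_MODE_FLAG_FULL | FIPS_MODE_FLAG_FORCEFAIL |
                             FIPS_MODE_FLAG_NOPANIC | FIPS_MODE_FLAG_VERBOSE;
/* ... later: fipspost_post(example_fips_mode, pmach_header); */
#endif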
struct mach_header;
/*
* Entrypoint for all POST tests.
*/
int fipspost_post(uint32_t fips_mode, struct mach_header *pmach_header);
#endif /* _CORECRYPTO_FIPSPOST_H_ */

View File

@ -0,0 +1,18 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_GET_CPU_KEY_H_
#define _CORECRYPTO_FIPSPOST_GET_CPU_KEY_H_
size_t fipspost_get_cpu_key(char *label, size_t label_size, cpu_type_t cpuType,
cpu_subtype_t cpusubtype);
#endif

View File

@ -0,0 +1,101 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_GET_HMAC_H_
#define _CORECRYPTO_FIPSPOST_GET_HMAC_H_
#include <corecrypto/ccsha2.h>
struct mach_header;
/*
* The pre-calculated SHA256 HMAC gets placed here for integrity
* testing. The current value is a random number. Use a different random
* number for each architecture type supported.
*/
#define FIPSPOST_PRECALC_HMAC_SIZE CCSHA256_OUTPUT_SIZE
#define FIPSPOST_HMAC_VALUE fipspost_precalc_hmac
#define FIPSPOST_PRECALC_HMAC_VARIABLE \
__attribute__((section("__TEXT,__fips_hmacs"))) const unsigned char FIPSPOST_HMAC_VALUE[FIPSPOST_PRECALC_HMAC_SIZE]
#define FIPSPOST_PRECALC_HMAC(ARCH, MODE) \
{ ARCH, MODE, 0x10, 0xdc, 0xe5, 0x34, 0x6f, 0x01, \
0xdd, 0x82, 0xf8, 0xad, 0xe5, 0x8f, 0xa1, 0xcc, \
0xc1, 0x32, 0xe5, 0xa8, 0x53, 0xc8, 0x39, 0xa3, \
0x84, 0x5f, 0x3b, 0xcb, 0x39, 0x9e, 0xd1, 0x7b }
/* Comprehensive list, in the order of mach/machine.h */
#define FIPSPOST_PRECALC_HMAC_VALUE_X86_64 FIPSPOST_PRECALC_HMAC(0x86, 0x64)
#define FIPSPOST_PRECALC_HMAC_VALUE_X86_32 FIPSPOST_PRECALC_HMAC(0x86, 0x32)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_4T FIPSPOST_PRECALC_HMAC(0xa4, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_6 FIPSPOST_PRECALC_HMAC(0xa6, 0x00)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_V5TEJ FIPSPOST_PRECALC_HMAC(0xa5, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_XSCALE FIPSPOST_PRECALC_HMAC(0xa5, 0x02)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7A FIPSPOST_PRECALC_HMAC(0xa7, 0x0a)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7F FIPSPOST_PRECALC_HMAC(0xa7, 0x0f)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7S FIPSPOST_PRECALC_HMAC(0xa7, 0x05)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7K FIPSPOST_PRECALC_HMAC(0xa7, 0x04)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_6M FIPSPOST_PRECALC_HMAC(0xa6, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7M FIPSPOST_PRECALC_HMAC(0xa7, 0x06)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7EM FIPSPOST_PRECALC_HMAC(0xa7, 0x07)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64 FIPSPOST_PRECALC_HMAC(0xa8, 0x64)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64_V8 FIPSPOST_PRECALC_HMAC(0xa8, 0x68)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64E FIPSPOST_PRECALC_HMAC(0xa8, 0x6e)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64_32 FIPSPOST_PRECALC_HMAC(0xa8, 0x32)
#define FIPSPOST_CREATE_PRECALC_HMAC(ARCH, VARIANT) \
FIPSPOST_PRECALC_HMAC_VARIABLE = FIPSPOST_PRECALC_HMAC_VALUE ## _ ## ARCH ## _ ## VARIANT;
/*
* Declare the individual variants based on the current architecture. Use the
* raw compiler flags because each archive must have a different value, even if
* they're all classed as '__arm__', to avoid duplicate values in a FAT file.
*/
#if defined(__x86_64__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(X86, 64)
#elif defined(__i386__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(X86, 32)
#elif defined(__ARM_ARCH_4T__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 4T)
#elif defined(__ARM_ARCH_6K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 6)
// Unknown compiler flags for V5TEJ
// Unknown compiler flags for XSCALE
#elif defined (__ARM_ARCH_7A__) && !defined (__ARM_ARCH_7K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7A)
#elif defined (__ARM_ARCH_7F__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7F)
#elif defined (__ARM_ARCH_7S__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7S)
#elif defined (__ARM_ARCH_7K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7K)
#elif defined(__ARM_ARCH_6M__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 6M)
#elif defined (__ARM_ARCH_7M__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7M)
#elif defined(__ARM_ARCH_7EM__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7EM)
#elif defined(__arm64e__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64E)
#elif defined(__ARM64_ARCH_8_32__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64_32)
#elif defined(__ARM_ARCH_ISA_A64)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64)
// Unknown compiler flags for 64_V8
#else
#error Unsupported architecture type; add as necessary in the order of mach/machine.h.
#endif
#define FIPSPOST_EXTERN_PRECALC_HMAC extern FIPSPOST_PRECALC_HMAC_VARIABLE;
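/*
 * Intended usage, as suggested by the macros above (illustrative only): exactly
 * one translation unit in the module expands FIPSPOST_DECLARE_PRECALC_HMAC at
 * file scope, emitting the per-architecture placeholder constant into the
 * __TEXT,__fips_hmacs section, while other files reference it through
 * FIPSPOST_EXTERN_PRECALC_HMAC.
 */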
int fipspost_get_hmac(const struct mach_header* pmach_header, unsigned char* sha256HMACBuffer, size_t max_offset);
#endif

View File

@ -0,0 +1,33 @@
/* Copyright (c) (2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_INDICATOR_H_
#define _CORECRYPTO_FIPSPOST_INDICATOR_H_
/// Checks if a symmetric algorithm mode is allowed for the given key size.
int fips_allowed_mode(const void *mode, size_t key_byte_length);
/// Checks whether a function is allowed according to FIPS. The variadic arguments make precise the context in which the
/// function will be used, when required. E.g., for a SHA* hash function no parameters are needed, since the function alone
/// is sufficient to define the use. Conversely, a symmetric mode requires the key length in bytes and the cryptographic
/// algorithm. num_args: the number of passed arguments; it can currently be 0, 1, or 2. Depending on num_args, the following arguments can be:
/// * num_args == 1:
/// - struct ccdigest_info * for a DRBG function
/// - ccec_const_cp_t for an ECC function
/// - struct ccdigest_info * for a HMAC function
/// - ccdh_const_gp_t for a DH function
/// - ccec_const_cp_t for ECDH function
/// - key_byte_length for a KDF CTR CMAC function
/// - struct ccdigest_info * for a KDF CTR HMAC or PBKDF2 function
/// - key_bit_length for RSA related functions
int fips_allowed(const void *function, size_t num_args, ...);
#endif /* _CORECRYPTO_FIPSPOST_INDICATOR_H_ */

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_CBC_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_CBC_H_
int fipspost_post_aes_cbc(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,20 @@
/* Copyright (c) (2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
// Created on 5/1/18.
//
// Copyright (c) 2018 Apple Inc. All rights reserved.
#ifndef fipspost_post_aes_ccm_h
#define fipspost_post_aes_ccm_h
int fipspost_post_aes_ccm(uint32_t fips_mode);
#endif /* fipspost_post_aes_ccm_h */

View File

@ -0,0 +1,18 @@
/* Copyright (c) (2017,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_CMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_CMAC_H_
int fipspost_post_aes_cmac(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_ECB_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_ECB_H_
int fipspost_post_aes_ecb(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_GCM_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_GCM_H_
int fipspost_post_aes_gcm(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,20 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_SKG_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_SKG_H_
int fipspost_post_aes_skg_enc_ecb_128(uint32_t fips_mode);
int fipspost_post_aes_skg_dec_ecb_128(uint32_t fips_mode);
int fipspost_post_aes_skg_enc_cbc_128(uint32_t fips_mode);
int fipspost_post_aes_skg_dec_cbc_128(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_TRNG_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_TRNG_H_
int fipspost_post_aes_trng(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_XTS_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_XTS_H_
int fipspost_post_aes_xts(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_CTR_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_CTR_H_
int fipspost_post_drbg_ctr(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_HMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_HMAC_H_
int fipspost_post_drbg_hmac(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_TRNG_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_TRNG_H_
int fipspost_post_drbg_trng(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_ECDH_H_
#define _CORECRYPTO_FIPSPOST_POST_ECDH_H_
int fipspost_post_ecdh(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_ECDSA_H_
#define _CORECRYPTO_FIPSPOST_POST_ECDSA_H_
#include <stdint.h>
// POST for ECDSA signature operations.
int fipspost_post_ecdsa(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_FFDH_H_
#define _CORECRYPTO_FIPSPOST_POST_FFDH_H_
#include <stdint.h>
// POST for finite-field Diffie-Hellman (FFDH) key agreement.
int fipspost_post_ffdh(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_HMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_HMAC_H_
#include <stdint.h>
// POST for HMAC.
int fipspost_post_hmac(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,19 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_
#define _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_
#include <stdint.h>
int fipspost_post_indicator(uint32_t fips_mode);
#endif /* _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_ */

View File

@ -0,0 +1,19 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_INTEGRITY_H_
#define _CORECRYPTO_FIPSPOST_POST_INTEGRITY_H_
#include <stdint.h>
struct mach_header;
// Software integrity POST over the Mach-O image identified by pmach_header.
int fipspost_post_integrity(uint32_t fips_mode, struct mach_header *pmach_header);
#endif
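Unlike the other POST routines, fipspost_post_integrity also takes a pointer to a Mach-O header, i.e. the image whose integrity is to be verified. A minimal sketch of obtaining such a header via dyld's _dyld_get_image_header() follows; using image index 0 (the main executable) is purely an illustrative assumption, and in practice the caller would presumably pass the header of the image that contains corecrypto.

#include <stdint.h>
#include <stdio.h>
#include <mach-o/dyld.h>

int fipspost_post_integrity(uint32_t fips_mode, struct mach_header *pmach_header);

int main(void)
{
    /* Header of image 0 (the main executable); chosen only for illustration. */
    const struct mach_header *mh = _dyld_get_image_header(0);
    int rc = fipspost_post_integrity(0 /* assumed default fips_mode */,
                                     (struct mach_header *)mh);
    printf("integrity POST rc=%d\n", rc);
    return rc == 0 ? 0 : 1;
}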

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2012,2015,2020) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_KDF_CTR_H_
#define _CORECRYPTO_FIPSPOST_POST_KDF_CTR_H_
#include <stdint.h>
// POST for the counter-mode KDF (KDF_CTR).
int fipspost_post_kdf_ctr(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2012,2015,2020) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_PBKDF_H_
#define _CORECRYPTO_FIPSPOST_POST_PBKDF_H_
#include <stdint.h>
// POST for the password-based KDF (PBKDF2).
int fipspost_post_pbkdf(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,22 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_RSA_H_
#define _CORECRYPTO_FIPSPOST_POST_RSA_H_
#include <stdint.h>
#include <stdlib.h>
// DER-encoded RSA key used for the RSA operation tests; taken from the FIPS 186-2 RSA test vectors.
extern const uint8_t fipspost_post_rsa_test_key[];
extern const size_t fipspost_post_rsa_test_key_nbytes;
#endif
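This header exposes the DER-encoded RSA test key as a raw byte array plus its length. A trivial sanity-check sketch that uses no corecrypto APIs: a DER-encoded RSA key is an ASN.1 SEQUENCE, so its first byte must be the SEQUENCE tag 0x30, which can be checked before handing the blob to an importer.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

extern const uint8_t fipspost_post_rsa_test_key[];
extern const size_t fipspost_post_rsa_test_key_nbytes;

int main(void)
{
    if (fipspost_post_rsa_test_key_nbytes == 0 ||
        fipspost_post_rsa_test_key[0] != 0x30 /* ASN.1 SEQUENCE tag */) {
        fprintf(stderr, "unexpected DER encoding\n");
        return EXIT_FAILURE;
    }
    printf("DER test key: %zu bytes\n", fipspost_post_rsa_test_key_nbytes);
    return EXIT_SUCCESS;
}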

Some files were not shown because too many files have changed in this diff.