first commit

commit 67e48c5b1d

@@ -0,0 +1,2 @@
build

@@ -0,0 +1,310 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#

# CMake corecrypto build for Linux
#
# This CMake build generates the corecrypto_static library. It is meant to be
# used for Linux only.
#

cmake_minimum_required(VERSION 3.4.3)
set(CMAKE_OSX_SYSROOT "macosx.internal") # NOTE: This must be set before the call to project
project (corecrypto C)

option(CC_LINUX_ASM "Enable assembler support on Linux platform" OFF)

include (CoreCryptoSources.cmake)

#
# Build Macros and Targets
#

# get_include_dirs: extract include directories from a list of headers
macro (get_include_dirs out in)
    foreach (file ${in})

        # Add the directory containing the header
        get_filename_component(dir ${file} DIRECTORY)
        list(APPEND ${out} ${dir})

        # If the directory is named corecrypto, we should also add its
        # parent to the include dirs.
        get_filename_component(dirname ${dir} NAME)
        if (${dirname} STREQUAL "corecrypto")
            get_filename_component(parent ${dir} DIRECTORY)
            list(APPEND ${out} ${parent})
        endif()

    endforeach()
endmacro()
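
# Illustrative usage sketch (hypothetical variable name, not part of the
# original build): seed an output variable with any fixed dirs, then append
# the dirs derived from a header list, e.g.
#     set(example_include_dirs "")
#     get_include_dirs(example_include_dirs "${CORECRYPTO_PUBLIC_HDRS}")
# This is how cc_include_dir and cctest_include_dir are built further below.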

# Project-level settings

## Build all objects with -fPIC
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

## CMake spelling of -std=gnu99
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS ON)

## Project globals
set_property(DIRECTORY
    APPEND PROPERTY COMPILE_DEFINITIONS
    COMPILING_CORECRYPTO=1
    $<$<CONFIG:Debug>:DEBUG=1>
    $<$<CONFIG:Release>:NDEBUG>
)
set(CC_C_OPTIONS
    -DBUILDKERNEL=0
    -Wundef
    -Wcast-qual
    -Wno-error=deprecated-declarations
    $<$<CONFIG:Debug>:-Werror>
)
add_compile_options(
    "$<$<COMPILE_LANGUAGE:C>:${CC_C_OPTIONS}>"
)

# System dependencies
find_package(UnixCommands REQUIRED) # For ${BASH}
find_package(Threads REQUIRED)
find_library(MATH_LIBRARY m DOC "libm")
if(NOT MATH_LIBRARY)
    message(SEND_ERROR "Could not find libm")
endif()

# Platform-specific dependencies
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")

    find_library(SYSTEM_FRAMEWORK NAMES System)
    mark_as_advanced(SYSTEM_FRAMEWORK)
    find_path(SYSTEM_CPU_CAPABILITIES_PATH i386/cpu_capabilities.h
        HINTS "${SYSTEM_FRAMEWORK}/PrivateHeaders")
    mark_as_advanced(SYSTEM_CPU_CAPABILITIES_PATH)
    if(NOT SYSTEM_FRAMEWORK OR NOT SYSTEM_CPU_CAPABILITIES_PATH)
        unset(SYSTEM_FRAMEWORK CACHE)
        message(SEND_ERROR
            "Could not find internal System.framework\n"
            "HINT: Run cmake with xcrun to point it at the right SDK, or try:\n"
            "  ${CMAKE_COMMAND} -DCMAKE_OSX_SYSROOT=macosx.internal .")
    else()
        message("-- Found internal System.framework")
    endif()

    # Compile assembler sources on OSX
    enable_language(ASM)

    # Enable FIPS POST trace on OSX
    set_source_files_properties(cc_fips/src/fipspost_trace.c cc_fips/crypto_test/crypto_test_cc_fips.c
        PROPERTIES COMPILE_FLAGS -DCORECRYPTO_POST_TRACE=1)

elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")

    # Exclude sources that don't apply to Linux (or haven't yet been ported)
    set (CORECRYPTO_EXCLUDE_SRCS
        # exclude files that are OSX-dependent
        cc_fips/src/fipspost_get_cpu_key.c
        cc_fips/src/fipspost_get_hmac.c
        cckprng/src/cckprng_diag.c
        cckprng/src/cckprng_diaggens.c
        cckprng/src/cckprng_generate.c
        cckprng/src/cckprng_init.c
        cckprng/src/cckprng_initgen.c
        cckprng/src/cckprng_loadseed.c
        cckprng/src/cckprng_printdiag.c
        cckprng/src/cckprng_ratchetseed.c
        cckprng/src/cckprng_refresh.c
        cckprng/src/cckprng_rekeygen.c
        cckprng/src/cckprng_rekeygens.c
        cckprng/src/cckprng_reseed.c
        cckprng/src/cckprng_storeseed.c
        cckprng/src/prng.c
    )

    set (CORECRYPTO_TEST_EXCLUDE_SRCS
        # exclude files that are OSX-dependent
        cc_fips/src/fipspost_get_cpu_key.c
        cc_fips/src/fipspost_get_hmac.c
        corecrypto_test/lib/ccshadow.c
        corecrypto_test/lib/cccycles.c
        cckprng/crypto_test/crypto_test_kprng.c

        # this test requires trace to be enabled
        cc_fips/crypto_test/crypto_test_cc_fips.c
    )

    set (CORECRYPTO_PERF_EXCLUDE_SRCS
        # exclude files that are OSX-dependent
        corecrypto_perf/src/ccperf_kprng.c
    )

    if (CC_LINUX_ASM)
        enable_language(ASM)

        # Add assembler-specific clang flags
        set (CC_ASM_OPTIONS
            -integrated-as        # Always use clang's integrated assembler
            -x assembler-with-cpp # Run the preprocessor despite the .s name
        )
        add_compile_options(
            "$<$<COMPILE_LANGUAGE:ASM>:${CC_ASM_OPTIONS}>"
        )

        # Enable the Linux assembler in corecrypto
        add_compile_options(
            "-DCC_LINUX_ASM=1"
        )
    endif()
endif()

include(GNUInstallDirs)
if(NOT CMAKE_C_COMPILER_ID MATCHES "Clang")
    message(FATAL_ERROR "Only clang is supported for compilation, found ${CMAKE_C_COMPILER_ID} (${CMAKE_C_COMPILER})")
endif()

#
# corecrypto_static library target
#

# A few include dirs cannot be derived automatically from the header lists
# below. Add them manually.
set (CORECRYPTO_FIXED_INCLUDE_DIRS
    ccaes/src/vng
    cckprng
    cckprng/corecrypto
    corecrypto_test/include
    acceleratecrypto/Include
    acceleratecrypto/Header
    ccec25519/src
)

# Find include dirs for corecrypto_static headers.
set (cc_include_dir ${CORECRYPTO_FIXED_INCLUDE_DIRS})
get_include_dirs (cc_include_dir "${CORECRYPTO_PROJECT_HDRS}")
get_include_dirs (cc_include_dir "${CORECRYPTO_PUBLIC_HDRS}")
get_include_dirs (cc_include_dir "${CORECRYPTO_PRIVATE_HDRS}")
list (REMOVE_DUPLICATES cc_include_dir)

# Filter out excluded sources
if(CORECRYPTO_EXCLUDE_SRCS)
    list(REMOVE_ITEM CORECRYPTO_SRCS ${CORECRYPTO_EXCLUDE_SRCS})
endif()

# Create target for corecrypto_static
add_library(corecrypto_static STATIC ${CORECRYPTO_SRCS})
target_link_libraries(corecrypto_static
    PRIVATE $<$<PLATFORM_ID:Darwin>:${SYSTEM_FRAMEWORK}> ${MATH_LIBRARY})
target_include_directories(corecrypto_static PRIVATE ${cc_include_dir})
set_property(TARGET corecrypto_static PROPERTY POSITION_INDEPENDENT_CODE ON)

# Generate pkgconfig for corecrypto_static
configure_file("corecrypto.pc.in" "corecrypto.pc" @ONLY)

# Install corecrypto_static
install (TARGETS corecrypto_static ARCHIVE
    DESTINATION "${CMAKE_INSTALL_LIBDIR}")
install (FILES ${CORECRYPTO_PUBLIC_HDRS} ${CORECRYPTO_PRIVATE_HDRS}
    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/corecrypto")
install (FILES ${CMAKE_CURRENT_BINARY_DIR}/corecrypto.pc
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")

#
# corecrypto_test target
#

# Remove the .inc and other non-C files from the sources
foreach (file ${CORECRYPTO_TEST_SRCS})
    string (REGEX MATCH ".+\\.c$" match ${file})
    if (NOT match)
        list (REMOVE_ITEM CORECRYPTO_TEST_SRCS ${file})
    endif()
endforeach()

# A few include dirs cannot be derived automatically from the header lists
# below. Add them manually.
set (CORECRYPTO_TEST_FIXED_INCLUDE_DIRS
    ccsha2/src
    ccrng/src
    ccec25519/src
    ccaes/src/ios_hardware
    corecrypto_test
    cczp/src
)

# Find include dirs for corecrypto_test headers.
set (cctest_include_dir ${CORECRYPTO_TEST_FIXED_INCLUDE_DIRS})
get_include_dirs (cctest_include_dir "${CORECRYPTO_TEST_HDRS}")
get_include_dirs (cctest_include_dir "${CORECRYPTO_TEST_SRCS}")
list (REMOVE_DUPLICATES cctest_include_dir)

# Create target for corecrypto_test
if(CORECRYPTO_TEST_EXCLUDE_SRCS)
    list (REMOVE_ITEM CORECRYPTO_TEST_SRCS ${CORECRYPTO_TEST_EXCLUDE_SRCS})
endif()
add_executable(corecrypto_test ${CORECRYPTO_TEST_SRCS})
target_compile_definitions(corecrypto_test PRIVATE CC_UNITTEST=1)
target_include_directories(corecrypto_test
    PRIVATE ${cctest_include_dir} ${cc_include_dir})
target_link_libraries(corecrypto_test PRIVATE corecrypto_static
    Threads::Threads ${MATH_LIBRARY} ${CMAKE_DL_LIBS})

# Generate test vectors
set(CC_CONVERT_TEST_VECTORS scripts/convert_testvectors.sh)
set(CC_TEST_VECTORS corecrypto_test/test_vectors/wycheproof/chacha20_poly1305_test.json)
set(GENERATED_TEST_VECTORS_DIR ${CMAKE_CURRENT_BINARY_DIR}/gen/corecrypto_test/include)
set(GENERATED_TEST_VECTORS ${GENERATED_TEST_VECTORS_DIR}/cc_generated_test_vectors.h)
add_custom_command(
    OUTPUT ${GENERATED_TEST_VECTORS}
    COMMAND ${CMAKE_COMMAND} -E make_directory ${GENERATED_TEST_VECTORS_DIR}
    COMMAND ${BASH} ${CMAKE_SOURCE_DIR}/${CC_CONVERT_TEST_VECTORS} ${GENERATED_TEST_VECTORS} ${CMAKE_CURRENT_SOURCE_DIR}/corecrypto_test/test_vectors/wycheproof
    COMMENT "Generating test vectors"
    DEPENDS ${CC_CONVERT_TEST_VECTORS} ${CC_TEST_VECTORS}
)
target_sources(corecrypto_test PRIVATE ${GENERATED_TEST_VECTORS})
target_include_directories(corecrypto_test PRIVATE ${GENERATED_TEST_VECTORS_DIR})

set(CC_CONVERT_TEST_VECTORS_PC scripts/convert_h2c_testvectors.py)
message(STATUS "Running python convert_h2c_testvectors.py")
execute_process(
    COMMAND ${PYTHON} ${CMAKE_SOURCE_DIR}/${CC_CONVERT_TEST_VECTORS_PC} ${CMAKE_CURRENT_SOURCE_DIR}
    RESULT_VARIABLE RESULT_PC
    OUTPUT_VARIABLE OUTPUT_PC
    ERROR_VARIABLE ERROR_PC
)
message(STATUS "result convert_vectors: ${RESULT_PC}")
message(STATUS "output convert_vectors: ${OUTPUT_PC}")
message(STATUS "error convert_vectors: ${ERROR_PC}")

#
# corecrypto_perf target
#

# ccperf.h lives in corecrypto_perf/corecrypto. Add it manually.
set (CORECRYPTO_PERF_FIXED_INCLUDE_DIRS
    corecrypto_perf/corecrypto
)
set (ccperf_include_dir ${CORECRYPTO_PERF_FIXED_INCLUDE_DIRS})

# Create target for corecrypto_perf
if(CORECRYPTO_PERF_EXCLUDE_SRCS)
    list (REMOVE_ITEM CORECRYPTO_PERF_SRCS ${CORECRYPTO_PERF_EXCLUDE_SRCS})
endif()
add_executable(corecrypto_perf ${CORECRYPTO_PERF_SRCS})
target_include_directories(corecrypto_perf
    PRIVATE ${ccperf_include_dir} ${cctest_include_dir} ${cc_include_dir})
target_link_libraries(corecrypto_perf PRIVATE corecrypto_static Threads::Threads ${MATH_LIBRARY})

File diff suppressed because it is too large

@@ -0,0 +1,61 @@
Copyright (c) Apple Inc. All rights reserved.

corecrypto Internal Use License Agreement

IMPORTANT: This Apple corecrypto software is supplied to you by Apple Inc. ("Apple")
in consideration of your agreement to the following terms, and your download or use
of this Apple software constitutes acceptance of these terms. If you do not agree
with these terms, please do not download or use this Apple software.

1. As used in this Agreement, the term "Apple Software" collectively means and
includes all of the Apple corecrypto materials provided by Apple here, including
but not limited to the Apple corecrypto software, frameworks, libraries, documentation
and other Apple-created materials. In consideration of your agreement to abide by the
following terms, conditioned upon your compliance with these terms and subject to
these terms, Apple grants you, for a period of ninety (90) days from the date you
download the Apple Software, a limited, non-exclusive, non-sublicensable license
under Apple’s copyrights in the Apple Software to make a reasonable number of copies
of, compile, and run the Apple Software internally within your organization only on
devices and computers you own or control, for the sole purpose of verifying the
security characteristics and correct functioning of the Apple Software; provided
that you must retain this notice and the following text and disclaimers in all
copies of the Apple Software that you make. You may not, directly or indirectly,
redistribute the Apple Software or any portions thereof. The Apple Software is only
licensed and intended for use as expressly stated above and may not be used for other
purposes or in other contexts without Apple's prior written permission. Except as
expressly stated in this notice, no other rights or licenses, express or implied, are
granted by Apple herein.

2. The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES
OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING
THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS,
SYSTEMS, OR SERVICES. APPLE DOES NOT WARRANT THAT THE APPLE SOFTWARE WILL MEET YOUR
REQUIREMENTS, THAT THE OPERATION OF THE APPLE SOFTWARE WILL BE UNINTERRUPTED OR
ERROR-FREE, THAT DEFECTS IN THE APPLE SOFTWARE WILL BE CORRECTED, OR THAT THE APPLE
SOFTWARE WILL BE COMPATIBLE WITH FUTURE APPLE PRODUCTS, SOFTWARE OR SERVICES. NO ORAL
OR WRITTEN INFORMATION OR ADVICE GIVEN BY APPLE OR AN APPLE AUTHORIZED REPRESENTATIVE
WILL CREATE A WARRANTY.

3. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING
IN ANY WAY OUT OF THE USE, REPRODUCTION, COMPILATION OR OPERATION OF THE APPLE
SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING
NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

4. This Agreement is effective until terminated. Your rights under this Agreement will
terminate automatically without notice from Apple if you fail to comply with any term(s)
of this Agreement. Upon termination, you agree to cease all use of the Apple Software
and destroy all copies, full or partial, of the Apple Software. This Agreement will be
governed and construed in accordance with the laws of the State of California, without
regard to its choice of law rules.

You may report security issues about Apple products to product-security@apple.com,
as described here: https://www.apple.com/support/security/. Non-security bugs and
enhancement requests can be made via https://bugreport.apple.com as described
here: https://developer.apple.com/bug-reporting/

EA1350
10/5/15

@@ -0,0 +1,13 @@
# Copyright (c) (2017,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#

coverage:
	./scripts/corecrypto_coverage.sh

@@ -0,0 +1,127 @@
/* Copyright (c) (2010,2012,2014-2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

The corecrypto (cc) project
===========================

The main goal is to provide low-level fast math routines and crypto APIs which
can be used in various environments (kernel, bootloader, userspace, etc.). It
is an explicit goal to minimize dependencies between modules and functions so
that clients of this library only end up with the routines they need and
nothing more.

corecrypto compiles under all Apple OSs, Windows, Android and Linux.

Corecrypto Modules
------------------

corecrypto currently consists of the following submodules:

* `cc`: Headers and code common to all of the modules
* `ccasn1`: ASN.1 type-id constants and the ccoid definition.
* `ccder`: DER encoding/decoding support
* `ccn`: Math on vectors of n cc_units
* `cczp`: Modular arithmetic mod integer p, on vectors of n cc_units
* `ccz`: Variable-sized signed integer math routines
* `ccdrbg`: Deterministic Random Byte Generators
* `ccrng`: Random Bytes Generators
* `ccdh`: Diffie-Hellman routines.
* `ccec25519`: Elliptic curve signature and Diffie-Hellman routines using the Edwards 25519 curve
* `ccrsa`: RSA routines.
* `ccec`: Elliptic curves, EC-specific math and APIs
* `ccdigest`: Digest abstraction layer.
* `cchmac`: HMAC using any ccdigest.
* `ccpbkdf2`: PBKDF2 using any ccdigest.
* `ccmd2`: MD2 digest implementations.
* `ccmd4`: MD4 digest implementations.
* `ccmd5`: MD5 digest implementations.
* `ccripemd`: RIPEMD digest implementations.
* `ccsha1`: SHA-1 digest implementations.
* `ccsha2`: SHA-2 digest implementations.
* `ccmode`: Symmetric cipher chaining mode interfaces.
* `ccpad`: Symmetric cipher padding code.
* `ccaes`: AES symmetric cipher implementations.
* `ccblowfish`: Blowfish symmetric cipher implementations.
* `cccast`: CAST symmetric cipher implementations.
* `ccdes`: DES and 3DES symmetric cipher implementations.
* `ccrc2`: RC2 symmetric cipher implementations.
* `ccrc4`: RC4 symmetric cipher implementations.
* `ccperf`: Performance testing harness.
* `cctest`: Common utilities for creating self tests and xcunit tests.
* `ccprime`: Functions for generating large prime numbers. Mostly used in RSA key generation.
* `ccspake`: SPAKE2+ password-based key exchange implementation.

### Module Subdirectories

Each module has the following subdirectories:

* `corecrypto`: headers for this module
* `src`: sources for this module
* `doc`: documentation, references, etc.
* `xcunit`: XCTest-based unit tests for this module.
* `crypto_tests`: sources for executable tests for this module
* `test_vectors`: test vectors for this module
* `tools`: sources for random helper tools.

The following subdirectories don't follow the module layout yet:

* `corecrypto_kext`: Supporting files for the kernel extension build and FIPS support.
* `corecrypto_dylib`: Supporting files for the userspace shared library build and FIPS support.

ARMv6-M
-------
ARMv6-M is not in the corecrypto project target list. To compile corecrypto for ARMv6-M, use the following command:
`$ xcodebuild -target "corecrypto_static" OTHER_CFLAGS="-Qunused-arguments" -sdk iphoneos.internal -arch armv6m`

Windows
-------
corecrypto compiles under Windows using Visual Studio 2015 and Clang with Microsoft CodeGen. The corecrypto Solution contains four projects:

1. `corecrypto`: This project compiles corecrypto and produces a static library in 32- and 64-bit modes.
2. `corecrypto_test`: This project compiles the corecrypto test files and links statically with the corecrypto debug library.
3. `corecrypto_perf`: This project compiles the corecrypto performance measurement files and links statically with the corecrypto release library.
4. `corecrypto_wintest`: This project contains simple code that links against corecrypto.lib and compiles as C++ using the Visual C++ compiler. It was created to make sure the corecrypto library can be linked into C++ software compiled with the Microsoft compiler.

Android
------
The corecrypto library, `corecrypto_test` and `corecrypto_perf` compile under Android. The Android project file is in the `android` subdirectory.

Linux
-----
The corecrypto library, `corecrypto_test` and `corecrypto_perf` compile under Linux and are built using cmake. See the CMake section for more details.
The Linux build does not use the ASM implementations by default, due to differences between the assemblers on Darwin and Linux (see the `CC_LINUX_ASM` option in the CMake build).

CMake
-----
The corecrypto library, `corecrypto_test` and `corecrypto_perf` can also be built using cmake on macOS and Linux.

To compile using cmake, run the usual cmake commands:
```
$ cd <srcdir>
$ mkdir build && cd build
$ CC=clang CXX=clang++ cmake ..
$ make
```
where `<srcdir>` is the path to the directory containing the sources.

To install, type `make install` from the build directory (this will require root privileges).

Prototype changes
-----------------
From time to time, corecrypto needs to change the prototype of a function.
In this case, we use a macro defined as:

    CC_CHANGEFUNCTION_<radar>_<function name>

and the header will document instructions for migrating from the old to the new function prototype.

@@ -0,0 +1,919 @@
// !$*UTF8*$!
{
	archiveVersion = 1;
	classes = {
	};
	objectVersion = 50;
	objects = {

/* Begin PBXAggregateTarget section */
		2CD5E9C120D85B370097F130 /* AccelerateCrypto */ = {
			isa = PBXAggregateTarget;
			buildConfigurationList = 2CD5E9C420D85B370097F130 /* Build configuration list for PBXAggregateTarget "AccelerateCrypto" */;
			buildPhases = (
			);
			dependencies = (
				2C88439021B74BE100C49BD9 /* PBXTargetDependency */,
				2C6CED2E20E195E90045D491 /* PBXTargetDependency */,
			);
			name = AccelerateCrypto;
			productName = AccelerateCrypto;
		};
/* End PBXAggregateTarget section */

/* Begin PBXBuildFile section */
		2C6CED1120E1956A0045D491 /* sha512_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */; };
		2C6CED1220E195710045D491 /* sha512_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */; };
		2C6CED1320E1957F0045D491 /* sha512_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */; };
		2C6CED1420E1957F0045D491 /* sha512_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */; };
		2C6CED1520E1957F0045D491 /* sha512_compress_ssse3.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */; };
		2C6CED1620E1957F0045D491 /* sha512_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED620DD5D2C00840ABB /* sha512_compress.c */; };
		2C6CED1720E1957F0045D491 /* sha512_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED120DD5D2C00840ABB /* sha512_K.c */; };
		2C6CED1820E195850045D491 /* sha256_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */; };
		2C6CED1920E195890045D491 /* sha256_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */; };
		2C6CED1A20E1958D0045D491 /* sha256_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */; };
		2C6CED1B20E1958D0045D491 /* sha256_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */; };
		2C6CED1C20E1958D0045D491 /* sha256_compress_ssse3_32.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */; };
		2C6CED1D20E1958D0045D491 /* sha256_compress_ssse3_64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */; };
		2C6CED1E20E1958D0045D491 /* sha256_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBB20DD5D1800840ABB /* sha256_compress.c */; };
		2C6CED1F20E1958D0045D491 /* sha256_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ECA20DD5D1900840ABB /* sha256_K.c */; };
		2C6CED2020E195930045D491 /* sha1_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */; };
		2C6CED2120E195970045D491 /* sha1_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */; };
		2C6CED2220E1959B0045D491 /* sha1_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */; };
		2C6CED2320E1959B0045D491 /* sha1_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */; };
		2C6CED2420E1959B0045D491 /* sha1_compress_sse.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */; };
		2C6CED2520E1959B0045D491 /* sha1_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA820DD5D0100840ABB /* sha1_compress.c */; };
		2C6CED2620E195A80045D491 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE120DD5D4600840ABB /* decrypt.s */; };
		2C6CED2720E195A80045D491 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE220DD5D4600840ABB /* encrypt.s */; };
		2C6CED2820E195A80045D491 /* vpaes-armv7.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE020DD5D4600840ABB /* vpaes-armv7.s */; };
		2C6CED2920E195B60045D491 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDC20DD5D4600840ABB /* decrypt.s */; };
		2C6CED2A20E195B60045D491 /* decrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */; };
		2C6CED2B20E195B60045D491 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDE20DD5D4600840ABB /* encrypt.s */; };
		2C6CED2C20E195B60045D491 /* encrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */; };
		2C6CED2F20E302B40045D491 /* AccelerateCrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */; };
		2C88436C21B74AD500C49BD9 /* sha1_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA820DD5D0100840ABB /* sha1_compress.c */; };
		2C88436D21B74AD500C49BD9 /* sha256_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */; };
		2C88436E21B74AD500C49BD9 /* sha256_compress_ssse3_64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */; };
		2C88436F21B74AD500C49BD9 /* sha1_compress_sse.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */; };
		2C88437021B74AD500C49BD9 /* encrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */; };
		2C88437121B74AD500C49BD9 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE220DD5D4600840ABB /* encrypt.s */; };
		2C88437221B74AD500C49BD9 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDE20DD5D4600840ABB /* encrypt.s */; };
		2C88437321B74AD500C49BD9 /* sha1_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */; };
		2C88437421B74AD500C49BD9 /* sha256_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBB20DD5D1800840ABB /* sha256_compress.c */; };
		2C88437521B74AD500C49BD9 /* sha512_compress_ssse3.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */; };
		2C88437621B74AD500C49BD9 /* sha512_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */; };
		2C88437721B74AD500C49BD9 /* sha512_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */; };
		2C88437821B74AD500C49BD9 /* sha256_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ECA20DD5D1900840ABB /* sha256_K.c */; };
		2C88437921B74AD500C49BD9 /* sha1_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */; };
		2C88437A21B74AD500C49BD9 /* sha512_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED620DD5D2C00840ABB /* sha512_compress.c */; };
		2C88437B21B74AD500C49BD9 /* sha256_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */; };
		2C88437C21B74AD500C49BD9 /* sha256_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */; };
		2C88437D21B74AD500C49BD9 /* vpaes-armv7.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE020DD5D4600840ABB /* vpaes-armv7.s */; };
		2C88437E21B74AD500C49BD9 /* sha256_compress_ssse3_32.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */; };
		2C88437F21B74AD500C49BD9 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE120DD5D4600840ABB /* decrypt.s */; };
		2C88438021B74AD500C49BD9 /* sha1_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */; };
		2C88438121B74AD500C49BD9 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDC20DD5D4600840ABB /* decrypt.s */; };
		2C88438221B74AD500C49BD9 /* sha512_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED120DD5D2C00840ABB /* sha512_K.c */; };
		2C88438321B74AD500C49BD9 /* sha512_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */; };
		2C88438421B74AD500C49BD9 /* sha256_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */; };
		2C88438521B74AD500C49BD9 /* sha1_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */; };
		2C88438621B74AD500C49BD9 /* decrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */; };
		2C88438721B74AD500C49BD9 /* sha512_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */; };
		2C8843A921B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */; };
		2C8843AA21B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */; };
		2C8843AB21B8AA8200C49BD9 /* Context.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C8843A521B8AA8200C49BD9 /* Context.h */; };
		2C8843AC21B8AA8200C49BD9 /* Context.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C8843A521B8AA8200C49BD9 /* Context.h */; };
		2C8843AD21B8AA8200C49BD9 /* crypt_aesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */; };
		2C8843AE21B8AA8200C49BD9 /* crypt_aesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */; };
		2C8843AF21B8AA8200C49BD9 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A721B8AA8200C49BD9 /* aes.c */; };
		2C8843B021B8AA8200C49BD9 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A721B8AA8200C49BD9 /* aes.c */; };
		2C93F58321BAF750009239B3 /* AccelerateCrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */; };
/* End PBXBuildFile section */

/* Begin PBXContainerItemProxy section */
		2C6CED2D20E195E90045D491 /* PBXContainerItemProxy */ = {
			isa = PBXContainerItemProxy;
			containerPortal = 2CC8863B20D859F200D17D95 /* Project object */;
			proxyType = 1;
			remoteGlobalIDString = 2C6CED0720E195360045D491;
			remoteInfo = libAccelerateCrypto;
		};
		2C88438F21B74BE100C49BD9 /* PBXContainerItemProxy */ = {
			isa = PBXContainerItemProxy;
			containerPortal = 2CC8863B20D859F200D17D95 /* Project object */;
			proxyType = 1;
			remoteGlobalIDString = 2C88436A21B74AD500C49BD9;
			remoteInfo = libAccelerateCrypto_kernel;
		};
/* End PBXContainerItemProxy section */

/* Begin PBXFileReference section */
		2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = AccelerateCrypto.h; path = Header/AccelerateCrypto.h; sourceTree = SOURCE_ROOT; };
		2C447EA020DD5C1300840ABB /* config.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = config.h; path = Include/config.h; sourceTree = SOURCE_ROOT; };
		2C447EA120DD5C1300840ABB /* arm64_isa_compatibility.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = arm64_isa_compatibility.h; path = Include/arm64_isa_compatibility.h; sourceTree = SOURCE_ROOT; };
		2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_avx1.s; path = Source/sha1/intel/sha1_compress_avx1.s; sourceTree = SOURCE_ROOT; };
		2C447EA820DD5D0100840ABB /* sha1_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha1_compress.c; path = Source/sha1/intel/sha1_compress.c; sourceTree = SOURCE_ROOT; };
		2C447EAB20DD5D0100840ABB /* sha1_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha1_compress.c; sourceTree = "<group>"; };
		2C447EAC20DD5D0100840ABB /* sha1_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_avx1.s; sourceTree = "<group>"; };
		2C447EAD20DD5D0100840ABB /* sha1_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_avx2.s; sourceTree = "<group>"; };
		2C447EAE20DD5D0100840ABB /* sha1_compress_sse.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_sse.s; sourceTree = "<group>"; };
		2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_arm64.s; sourceTree = "<group>"; };
		2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_sse.s; path = Source/sha1/intel/sha1_compress_sse.s; sourceTree = SOURCE_ROOT; };
		2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_avx2.s; path = Source/sha1/intel/sha1_compress_avx2.s; sourceTree = SOURCE_ROOT; };
		2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_armv7neon.s; sourceTree = "<group>"; };
		2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_avx1.s; path = Source/sha256/intel/sha256_compress_avx1.s; sourceTree = SOURCE_ROOT; };
		2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_ssse3_64.s; path = Source/sha256/intel/sha256_compress_ssse3_64.s; sourceTree = SOURCE_ROOT; };
		2C447EBB20DD5D1800840ABB /* sha256_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha256_compress.c; path = Source/sha256/intel/sha256_compress.c; sourceTree = SOURCE_ROOT; };
		2C447EBD20DD5D1800840ABB /* sha256_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha256_compress.c; sourceTree = "<group>"; };
		2C447EBE20DD5D1800840ABB /* sha256_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_avx1.s; sourceTree = "<group>"; };
		2C447EBF20DD5D1800840ABB /* sha256_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_avx2.s; sourceTree = "<group>"; };
		2C447EC020DD5D1800840ABB /* sha256_compress_ssse3_32.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_ssse3_32.s; sourceTree = "<group>"; };
		2C447EC120DD5D1800840ABB /* sha256_compress_ssse3_64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_ssse3_64.s; sourceTree = "<group>"; };
		2C447EC220DD5D1800840ABB /* sha256_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha256_K.c; sourceTree = "<group>"; };
		2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_arm64.s; sourceTree = "<group>"; };
		2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_armv7neon.s; sourceTree = "<group>"; };
		2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_avx2.s; path = Source/sha256/intel/sha256_compress_avx2.s; sourceTree = SOURCE_ROOT; };
		2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_ssse3_32.s; path = Source/sha256/intel/sha256_compress_ssse3_32.s; sourceTree = SOURCE_ROOT; };
		2C447ECA20DD5D1900840ABB /* sha256_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha256_K.c; path = Source/sha256/intel/sha256_K.c; sourceTree = SOURCE_ROOT; };
		2C447ECC20DD5D2C00840ABB /* sha512_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha512_compress.c; sourceTree = "<group>"; };
		2C447ECD20DD5D2C00840ABB /* sha512_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_avx1.s; sourceTree = "<group>"; };
		2C447ECE20DD5D2C00840ABB /* sha512_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_avx2.s; sourceTree = "<group>"; };
		2C447ECF20DD5D2C00840ABB /* sha512_compress_ssse3.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_ssse3.s; sourceTree = "<group>"; };
		2C447ED120DD5D2C00840ABB /* sha512_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha512_K.c; path = Source/sha512/sha512_K.c; sourceTree = SOURCE_ROOT; };
		2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_ssse3.s; path = Source/sha512/intel/sha512_compress_ssse3.s; sourceTree = SOURCE_ROOT; };
		2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_avx2.s; path = Source/sha512/intel/sha512_compress_avx2.s; sourceTree = SOURCE_ROOT; };
		2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_avx1.s; path = Source/sha512/intel/sha512_compress_avx1.s; sourceTree = SOURCE_ROOT; };
		2C447ED620DD5D2C00840ABB /* sha512_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha512_compress.c; path = Source/sha512/intel/sha512_compress.c; sourceTree = SOURCE_ROOT; };
		2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_arm64.s; sourceTree = "<group>"; };
		2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_armv7neon.s; sourceTree = "<group>"; };
		2C447EDC20DD5D4600840ABB /* decrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = decrypt.s; sourceTree = "<group>"; };
		2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = decrypt_ecb.s; sourceTree = "<group>"; };
		2C447EDE20DD5D4600840ABB /* encrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = encrypt.s; sourceTree = "<group>"; };
		2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = encrypt_ecb.s; sourceTree = "<group>"; };
		2C447EE020DD5D4600840ABB /* vpaes-armv7.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = "vpaes-armv7.s"; path = "Source/aes/arm/vpaes-armv7.s"; sourceTree = SOURCE_ROOT; };
		2C447EE120DD5D4600840ABB /* decrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = decrypt.s; path = Source/aes/arm/decrypt.s; sourceTree = SOURCE_ROOT; };
		2C447EE220DD5D4600840ABB /* encrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = encrypt.s; path = Source/aes/arm/encrypt.s; sourceTree = SOURCE_ROOT; };
		2C447EE420DD5D4700840ABB /* EncryptDecrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = EncryptDecrypt.s; path = Source/aes/arm/EncryptDecrypt.s; sourceTree = SOURCE_ROOT; };
		2C6CED0820E195360045D491 /* libAccelerateCrypto.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libAccelerateCrypto.a; sourceTree = BUILT_PRODUCTS_DIR; };
		2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libAccelerateCrypto_kernel.a; sourceTree = BUILT_PRODUCTS_DIR; };
		2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = crypt_nonaesni.s; path = Source/aes/intel/crypt_nonaesni.s; sourceTree = SOURCE_ROOT; };
		2C8843A521B8AA8200C49BD9 /* Context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Context.h; path = Source/aes/intel/Context.h; sourceTree = SOURCE_ROOT; };
		2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = crypt_aesni.s; path = Source/aes/intel/crypt_aesni.s; sourceTree = SOURCE_ROOT; };
		2C8843A721B8AA8200C49BD9 /* aes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = aes.c; path = Source/aes/intel/aes.c; sourceTree = SOURCE_ROOT; };
		2C8843A821B8AA8200C49BD9 /* Data.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Data.s; path = Source/aes/intel/Data.s; sourceTree = SOURCE_ROOT; };
		2C8843B321B8AA9700C49BD9 /* EncryptDecrypt.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; name = EncryptDecrypt.s; path = Source/aes/intel/EncryptDecrypt.s; sourceTree = SOURCE_ROOT; };
/* End PBXFileReference section */

/* Begin PBXFrameworksBuildPhase section */
		2C6CED0520E195360045D491 /* Frameworks */ = {
			isa = PBXFrameworksBuildPhase;
			buildActionMask = 2147483647;
			files = (
			);
			runOnlyForDeploymentPostprocessing = 0;
		};
		2C88438821B74AD500C49BD9 /* Frameworks */ = {
			isa = PBXFrameworksBuildPhase;
			buildActionMask = 2147483647;
			files = (
			);
			runOnlyForDeploymentPostprocessing = 0;
		};
/* End PBXFrameworksBuildPhase section */

/* Begin PBXGroup section */
		2C447E9D20DD5B2600840ABB /* Header */ = {
			isa = PBXGroup;
			children = (
				2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */,
			);
			path = Header;
			sourceTree = "<group>";
		};
		2C447E9F20DD5BF300840ABB /* Include */ = {
			isa = PBXGroup;
			children = (
				2C447EA120DD5C1300840ABB /* arm64_isa_compatibility.h */,
				2C447EA020DD5C1300840ABB /* config.h */,
			);
			path = Include;
			sourceTree = "<group>";
		};
		2C447EA220DD5C2400840ABB /* Source */ = {
			isa = PBXGroup;
			children = (
				2C447EA620DD5C5F00840ABB /* sha512 */,
				2C447EA520DD5C5600840ABB /* sha256 */,
				2C447EA420DD5C4F00840ABB /* sha1 */,
				2C447EA320DD5C4400840ABB /* aes */,
			);
			path = Source;
			sourceTree = "<group>";
		};
		2C447EA320DD5C4400840ABB /* aes */ = {
			isa = PBXGroup;
			children = (
				2C8843A321B8AA4900C49BD9 /* intel */,
				2C447EE320DD5D4600840ABB /* arm */,
				2C447EDB20DD5D4600840ABB /* arm64 */,
			);
			path = aes;
			sourceTree = "<group>";
		};
		2C447EA420DD5C4F00840ABB /* sha1 */ = {
			isa = PBXGroup;
			children = (
				2C447EB320DD5D0100840ABB /* arm */,
				2C447EAF20DD5D0100840ABB /* arm64 */,
				2C447EAA20DD5D0100840ABB /* intel */,
				2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */,
				2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */,
				2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */,
				2C447EA820DD5D0100840ABB /* sha1_compress.c */,
			);
			path = sha1;
			sourceTree = "<group>";
		};
		2C447EA520DD5C5600840ABB /* sha256 */ = {
			isa = PBXGroup;
			children = (
				2C447EC620DD5D1900840ABB /* arm */,
				2C447EC320DD5D1800840ABB /* arm64 */,
				2C447EBC20DD5D1800840ABB /* intel */,
				2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */,
				2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */,
				2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */,
				2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */,
				2C447EBB20DD5D1800840ABB /* sha256_compress.c */,
				2C447ECA20DD5D1900840ABB /* sha256_K.c */,
			);
			path = sha256;
			sourceTree = "<group>";
		};
		2C447EA620DD5C5F00840ABB /* sha512 */ = {
			isa = PBXGroup;
			children = (
				2C447ED920DD5D2C00840ABB /* arm */,
				2C447ED720DD5D2C00840ABB /* arm64 */,
				2C447ECB20DD5D2C00840ABB /* intel */,
				2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */,
				2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */,
				2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */,
				2C447ED620DD5D2C00840ABB /* sha512_compress.c */,
				2C447ED120DD5D2C00840ABB /* sha512_K.c */,
			);
			path = sha512;
			sourceTree = "<group>";
		};
		2C447EAA20DD5D0100840ABB /* intel */ = {
			isa = PBXGroup;
			children = (
				2C447EAB20DD5D0100840ABB /* sha1_compress.c */,
				2C447EAC20DD5D0100840ABB /* sha1_compress_avx1.s */,
				2C447EAD20DD5D0100840ABB /* sha1_compress_avx2.s */,
				2C447EAE20DD5D0100840ABB /* sha1_compress_sse.s */,
			);
			name = intel;
			path = Source/sha1/intel;
			sourceTree = SOURCE_ROOT;
		};
		2C447EAF20DD5D0100840ABB /* arm64 */ = {
			isa = PBXGroup;
			children = (
				2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */,
			);
			name = arm64;
			path = Source/sha1/arm64;
			sourceTree = SOURCE_ROOT;
		};
		2C447EB320DD5D0100840ABB /* arm */ = {
			isa = PBXGroup;
			children = (
				2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */,
			);
			name = arm;
			path = Source/sha1/arm;
			sourceTree = SOURCE_ROOT;
		};
		2C447EBC20DD5D1800840ABB /* intel */ = {
			isa = PBXGroup;
			children = (
				2C447EBD20DD5D1800840ABB /* sha256_compress.c */,
				2C447EBE20DD5D1800840ABB /* sha256_compress_avx1.s */,
				2C447EBF20DD5D1800840ABB /* sha256_compress_avx2.s */,
				2C447EC020DD5D1800840ABB /* sha256_compress_ssse3_32.s */,
				2C447EC120DD5D1800840ABB /* sha256_compress_ssse3_64.s */,
				2C447EC220DD5D1800840ABB /* sha256_K.c */,
			);
			name = intel;
			path = Source/sha256/intel;
			sourceTree = SOURCE_ROOT;
		};
		2C447EC320DD5D1800840ABB /* arm64 */ = {
			isa = PBXGroup;
			children = (
				2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */,
			);
			name = arm64;
			path = Source/sha256/arm64;
			sourceTree = SOURCE_ROOT;
		};
		2C447EC620DD5D1900840ABB /* arm */ = {
			isa = PBXGroup;
			children = (
				2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */,
			);
			name = arm;
			path = Source/sha256/arm;
			sourceTree = SOURCE_ROOT;
		};
		2C447ECB20DD5D2C00840ABB /* intel */ = {
			isa = PBXGroup;
			children = (
				2C447ECC20DD5D2C00840ABB /* sha512_compress.c */,
				2C447ECD20DD5D2C00840ABB /* sha512_compress_avx1.s */,
				2C447ECE20DD5D2C00840ABB /* sha512_compress_avx2.s */,
				2C447ECF20DD5D2C00840ABB /* sha512_compress_ssse3.s */,
			);
			name = intel;
			path = Source/sha512/intel;
			sourceTree = SOURCE_ROOT;
		};
		2C447ED720DD5D2C00840ABB /* arm64 */ = {
			isa = PBXGroup;
			children = (
				2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */,
			);
			name = arm64;
			path = Source/sha512/arm64;
			sourceTree = SOURCE_ROOT;
		};
		2C447ED920DD5D2C00840ABB /* arm */ = {
			isa = PBXGroup;
			children = (
				2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */,
			);
			name = arm;
			path = Source/sha512/arm;
			sourceTree = SOURCE_ROOT;
		};
		2C447EDB20DD5D4600840ABB /* arm64 */ = {
			isa = PBXGroup;
			children = (
				2C447EDC20DD5D4600840ABB /* decrypt.s */,
				2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */,
				2C447EDE20DD5D4600840ABB /* encrypt.s */,
				2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */,
			);
			name = arm64;
			path = Source/aes/arm64;
			sourceTree = SOURCE_ROOT;
		};
		2C447EE320DD5D4600840ABB /* arm */ = {
			isa = PBXGroup;
			children = (
				2C447EE120DD5D4600840ABB /* decrypt.s */,
				2C447EE220DD5D4600840ABB /* encrypt.s */,
				2C447EE420DD5D4700840ABB /* EncryptDecrypt.s */,
				2C447EE020DD5D4600840ABB /* vpaes-armv7.s */,
			);
			name = arm;
			path = Source/aes/arm;
			sourceTree = SOURCE_ROOT;
		};
		2C447EEA20DD5FA700840ABB /* Products */ = {
			isa = PBXGroup;
			children = (
				2C6CED0820E195360045D491 /* libAccelerateCrypto.a */,
				2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */,
			);
			name = Products;
			sourceTree = "<group>";
		};
		2C8843A321B8AA4900C49BD9 /* intel */ = {
			isa = PBXGroup;
			children = (
				2C8843B321B8AA9700C49BD9 /* EncryptDecrypt.s */,
				2C8843A721B8AA8200C49BD9 /* aes.c */,
				2C8843A521B8AA8200C49BD9 /* Context.h */,
				2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */,
				2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */,
				2C8843A821B8AA8200C49BD9 /* Data.s */,
			);
			name = intel;
			sourceTree = "<group>";
		};
		2CC8863A20D859F200D17D95 = {
			isa = PBXGroup;
			children = (
				2C447EA220DD5C2400840ABB /* Source */,
				2C447E9F20DD5BF300840ABB /* Include */,
				2C447E9D20DD5B2600840ABB /* Header */,
				2C447EEA20DD5FA700840ABB /* Products */,
			);
			sourceTree = "<group>";
		};
/* End PBXGroup section */

/* Begin PBXHeadersBuildPhase section */
		2C6CED0620E195360045D491 /* Headers */ = {
			isa = PBXHeadersBuildPhase;
			buildActionMask = 2147483647;
			files = (
				2C8843AB21B8AA8200C49BD9 /* Context.h in Headers */,
				2C6CED2F20E302B40045D491 /* AccelerateCrypto.h in Headers */,
			);
			runOnlyForDeploymentPostprocessing = 0;
		};
		2C88438921B74AD500C49BD9 /* Headers */ = {
			isa = PBXHeadersBuildPhase;
			buildActionMask = 2147483647;
			files = (
				2C93F58321BAF750009239B3 /* AccelerateCrypto.h in Headers */,
				2C8843AC21B8AA8200C49BD9 /* Context.h in Headers */,
			);
			runOnlyForDeploymentPostprocessing = 0;
		};
/* End PBXHeadersBuildPhase section */

/* Begin PBXNativeTarget section */
		2C6CED0720E195360045D491 /* libAccelerateCrypto */ = {
			isa = PBXNativeTarget;
			buildConfigurationList = 2C6CED1020E195360045D491 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto" */;
			buildPhases = (
				2C6CED0420E195360045D491 /* Sources */,
				2C6CED0520E195360045D491 /* Frameworks */,
				2C6CED0620E195360045D491 /* Headers */,
			);
			buildRules = (
			);
			dependencies = (
			);
			name = libAccelerateCrypto;
			productName = libAccelerateCrypto;
			productReference = 2C6CED0820E195360045D491 /* libAccelerateCrypto.a */;
			productType = "com.apple.product-type.library.static";
		};
		2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */ = {
			isa = PBXNativeTarget;
			buildConfigurationList = 2C88438B21B74AD500C49BD9 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto_kernel" */;
			buildPhases = (
				2C88436B21B74AD500C49BD9 /* Sources */,
				2C88438821B74AD500C49BD9 /* Frameworks */,
				2C88438921B74AD500C49BD9 /* Headers */,
			);
			buildRules = (
			);
			dependencies = (
			);
			name = libAccelerateCrypto_kernel;
			productName = libAccelerateCrypto;
			productReference = 2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */;
			productType = "com.apple.product-type.library.static";
		};
/* End PBXNativeTarget section */

/* Begin PBXProject section */
		2CC8863B20D859F200D17D95 /* Project object */ = {
			isa = PBXProject;
			attributes = {
				LastUpgradeCheck = 1000;
				TargetAttributes = {
					2C6CED0720E195360045D491 = {
						CreatedOnToolsVersion = 10.0;
					};
					2CD5E9C120D85B370097F130 = {
						CreatedOnToolsVersion = 10.0;
					};
				};
			};
			buildConfigurationList = 2CC8863E20D859F200D17D95 /* Build configuration list for PBXProject "AccelerateCrypto" */;
			compatibilityVersion = "Xcode 9.3";
			developmentRegion = en;
			hasScannedForEncodings = 0;
			knownRegions = (
				en,
			);
			mainGroup = 2CC8863A20D859F200D17D95;
			productRefGroup = 2C447EEA20DD5FA700840ABB /* Products */;
			projectDirPath = "";
			projectRoot = "";
			targets = (
				2CD5E9C120D85B370097F130 /* AccelerateCrypto */,
				2C6CED0720E195360045D491 /* libAccelerateCrypto */,
				2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */,
			);
		};
/* End PBXProject section */

/* Begin PBXSourcesBuildPhase section */
		2C6CED0420E195360045D491 /* Sources */ = {
			isa = PBXSourcesBuildPhase;
			buildActionMask = 2147483647;
			files = (
				2C6CED2520E1959B0045D491 /* sha1_compress.c in Sources */,
				2C6CED1B20E1958D0045D491 /* sha256_compress_avx2.s in Sources */,
				2C6CED1D20E1958D0045D491 /* sha256_compress_ssse3_64.s in Sources */,
				2C6CED2420E1959B0045D491 /* sha1_compress_sse.s in Sources */,
				2C6CED2C20E195B60045D491 /* encrypt_ecb.s in Sources */,
				2C6CED2720E195A80045D491 /* encrypt.s in Sources */,
				2C6CED2B20E195B60045D491 /* encrypt.s in Sources */,
				2C6CED2220E1959B0045D491 /* sha1_compress_avx1.s in Sources */,
				2C6CED1E20E1958D0045D491 /* sha256_compress.c in Sources */,
				2C6CED1520E1957F0045D491 /* sha512_compress_ssse3.s in Sources */,
				2C6CED1220E195710045D491 /* sha512_compress_arm64.s in Sources */,
				2C8843AD21B8AA8200C49BD9 /* crypt_aesni.s in Sources */,
				2C6CED1320E1957F0045D491 /* sha512_compress_avx1.s in Sources */,
				2C6CED1F20E1958D0045D491 /* sha256_K.c in Sources */,
				2C6CED2320E1959B0045D491 /* sha1_compress_avx2.s in Sources */,
				2C6CED1620E1957F0045D491 /* sha512_compress.c in Sources */,
				2C6CED1920E195890045D491 /* sha256_compress_arm64.s in Sources */,
				2C6CED1820E195850045D491 /* sha256_compress_armv7neon.s in Sources */,
				2C6CED2820E195A80045D491 /* vpaes-armv7.s in Sources */,
||||
2C6CED1C20E1958D0045D491 /* sha256_compress_ssse3_32.s in Sources */,
|
||||
2C6CED2620E195A80045D491 /* decrypt.s in Sources */,
|
||||
2C6CED2120E195970045D491 /* sha1_compress_arm64.s in Sources */,
|
||||
2C6CED2920E195B60045D491 /* decrypt.s in Sources */,
|
||||
2C6CED1720E1957F0045D491 /* sha512_K.c in Sources */,
|
||||
2C6CED1420E1957F0045D491 /* sha512_compress_avx2.s in Sources */,
|
||||
2C6CED1A20E1958D0045D491 /* sha256_compress_avx1.s in Sources */,
|
||||
2C8843A921B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */,
|
||||
2C6CED2020E195930045D491 /* sha1_compress_armv7neon.s in Sources */,
|
||||
2C6CED2A20E195B60045D491 /* decrypt_ecb.s in Sources */,
|
||||
2C8843AF21B8AA8200C49BD9 /* aes.c in Sources */,
|
||||
2C6CED1120E1956A0045D491 /* sha512_compress_armv7neon.s in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
2C88436B21B74AD500C49BD9 /* Sources */ = {
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
2C88436C21B74AD500C49BD9 /* sha1_compress.c in Sources */,
|
||||
2C88436D21B74AD500C49BD9 /* sha256_compress_avx2.s in Sources */,
|
||||
2C88436E21B74AD500C49BD9 /* sha256_compress_ssse3_64.s in Sources */,
|
||||
2C88436F21B74AD500C49BD9 /* sha1_compress_sse.s in Sources */,
|
||||
2C88437021B74AD500C49BD9 /* encrypt_ecb.s in Sources */,
|
||||
2C88437121B74AD500C49BD9 /* encrypt.s in Sources */,
|
||||
2C88437221B74AD500C49BD9 /* encrypt.s in Sources */,
|
||||
2C88437321B74AD500C49BD9 /* sha1_compress_avx1.s in Sources */,
|
||||
2C88437421B74AD500C49BD9 /* sha256_compress.c in Sources */,
|
||||
2C88437521B74AD500C49BD9 /* sha512_compress_ssse3.s in Sources */,
|
||||
2C88437621B74AD500C49BD9 /* sha512_compress_arm64.s in Sources */,
|
||||
2C8843AE21B8AA8200C49BD9 /* crypt_aesni.s in Sources */,
|
||||
2C88437721B74AD500C49BD9 /* sha512_compress_avx1.s in Sources */,
|
||||
2C88437821B74AD500C49BD9 /* sha256_K.c in Sources */,
|
||||
2C88437921B74AD500C49BD9 /* sha1_compress_avx2.s in Sources */,
|
||||
2C88437A21B74AD500C49BD9 /* sha512_compress.c in Sources */,
|
||||
2C88437B21B74AD500C49BD9 /* sha256_compress_arm64.s in Sources */,
|
||||
2C88437C21B74AD500C49BD9 /* sha256_compress_armv7neon.s in Sources */,
|
||||
2C88437D21B74AD500C49BD9 /* vpaes-armv7.s in Sources */,
|
||||
2C88437E21B74AD500C49BD9 /* sha256_compress_ssse3_32.s in Sources */,
|
||||
2C88437F21B74AD500C49BD9 /* decrypt.s in Sources */,
|
||||
2C88438021B74AD500C49BD9 /* sha1_compress_arm64.s in Sources */,
|
||||
2C88438121B74AD500C49BD9 /* decrypt.s in Sources */,
|
||||
2C88438221B74AD500C49BD9 /* sha512_K.c in Sources */,
|
||||
2C88438321B74AD500C49BD9 /* sha512_compress_avx2.s in Sources */,
|
||||
2C88438421B74AD500C49BD9 /* sha256_compress_avx1.s in Sources */,
|
||||
2C8843AA21B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */,
|
||||
2C88438521B74AD500C49BD9 /* sha1_compress_armv7neon.s in Sources */,
|
||||
2C88438621B74AD500C49BD9 /* decrypt_ecb.s in Sources */,
|
||||
2C8843B021B8AA8200C49BD9 /* aes.c in Sources */,
|
||||
2C88438721B74AD500C49BD9 /* sha512_compress_armv7neon.s in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXSourcesBuildPhase section */
|
||||
|
||||
/* Begin PBXTargetDependency section */
|
||||
2C6CED2E20E195E90045D491 /* PBXTargetDependency */ = {
|
||||
isa = PBXTargetDependency;
|
||||
target = 2C6CED0720E195360045D491 /* libAccelerateCrypto */;
|
||||
targetProxy = 2C6CED2D20E195E90045D491 /* PBXContainerItemProxy */;
|
||||
};
|
||||
2C88439021B74BE100C49BD9 /* PBXTargetDependency */ = {
|
||||
isa = PBXTargetDependency;
|
||||
target = 2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */;
|
||||
targetProxy = 2C88438F21B74BE100C49BD9 /* PBXContainerItemProxy */;
|
||||
};
|
||||
/* End PBXTargetDependency section */
|
||||
|
||||
/* Begin XCBuildConfiguration section */
|
||||
2C6CED0E20E195360045D491 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
|
||||
CLANG_CXX_LIBRARY = "libc++";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
CODE_SIGN_IDENTITY = "-";
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_TESTABILITY = YES;
|
||||
EXECUTABLE_PREFIX = "";
|
||||
GCC_C_LANGUAGE_STANDARD = gnu11;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
INSTALL_PATH = "";
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.14;
|
||||
MTL_ENABLE_DEBUG_INFO = YES;
|
||||
ONLY_ACTIVE_ARCH = NO;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/include;
|
||||
SDKROOT = macosx.internal;
|
||||
SKIP_INSTALL = YES;
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
2C6CED0F20E195360045D491 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
|
||||
CLANG_CXX_LIBRARY = "libc++";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
CODE_SIGN_IDENTITY = "-";
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
EXECUTABLE_PREFIX = "";
|
||||
GCC_C_LANGUAGE_STANDARD = gnu11;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
INSTALL_PATH = "";
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.14;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/include;
|
||||
SDKROOT = macosx.internal;
|
||||
SKIP_INSTALL = YES;
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
2C88438C21B74AD500C49BD9 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
|
||||
CLANG_CXX_LIBRARY = "libc++";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
CODE_SIGN_IDENTITY = "-";
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_TESTABILITY = YES;
|
||||
EXECUTABLE_PREFIX = "";
|
||||
GCC_C_LANGUAGE_STANDARD = gnu11;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
INSTALL_PATH = "";
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.14;
|
||||
MTL_ENABLE_DEBUG_INFO = YES;
|
||||
ONLY_ACTIVE_ARCH = NO;
|
||||
OTHER_CFLAGS = "-DBUILDKERNEL=1";
|
||||
"OTHER_CFLAGS[arch=*]" = "-DBUILDKERNEL=1";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/standalone/firmware/include;
|
||||
SDKROOT = macosx.internal;
|
||||
SYSTEM_HEADER_SEARCH_PATHS = "$(SDKROOT)/System/Library/Frameworks/Kernel.framework/Headers";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
2C88438D21B74AD500C49BD9 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
|
||||
CLANG_CXX_LIBRARY = "libc++";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
CODE_SIGN_IDENTITY = "-";
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
EXECUTABLE_PREFIX = "";
|
||||
GCC_C_LANGUAGE_STANDARD = gnu11;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
INSTALL_PATH = "";
|
||||
MACOSX_DEPLOYMENT_TARGET = 10.14;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
OTHER_CFLAGS = "-DBUILDKERNEL=1";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/standalone/firmware/include;
|
||||
SDKROOT = macosx.internal;
|
||||
SYSTEM_HEADER_SEARCH_PATHS = "$(SDKROOT)/System/Library/Frameworks/Kernel.framework/Headers";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
2CC8863F20D859F200D17D95 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
SUPPORTED_PLATFORMS = "macosx iphoneos tvos watchos";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
2CC8864020D859F200D17D95 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
SUPPORTED_PLATFORMS = "macosx iphoneos tvos watchos";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
2CD5E9C220D85B370097F130 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
2CD5E9C320D85B370097F130 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
/* End XCBuildConfiguration section */
|
||||
|
||||
/* Begin XCConfigurationList section */
|
||||
2C6CED1020E195360045D491 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
2C6CED0E20E195360045D491 /* Debug */,
|
||||
2C6CED0F20E195360045D491 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
2C88438B21B74AD500C49BD9 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto_kernel" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
2C88438C21B74AD500C49BD9 /* Debug */,
|
||||
2C88438D21B74AD500C49BD9 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
2CC8863E20D859F200D17D95 /* Build configuration list for PBXProject "AccelerateCrypto" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
2CC8863F20D859F200D17D95 /* Debug */,
|
||||
2CC8864020D859F200D17D95 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
2CD5E9C420D85B370097F130 /* Build configuration list for PBXAggregateTarget "AccelerateCrypto" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
2CD5E9C220D85B370097F130 /* Debug */,
|
||||
2CD5E9C320D85B370097F130 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
};
|
||||
rootObject = 2CC8863B20D859F200D17D95 /* Project object */;
|
||||
}
|
||||
|
|
@ -0,0 +1,121 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef AccelerateCrypto_h
#define AccelerateCrypto_h

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

/*! @abstract SHA-1 160-bit digest update for numBlocks chunks of 64-byte (512-bit) data.
 *
 * @discussion
 * This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (CRYPTO), and armv7 (NEON).
 *
 * @param state (input/output) Array of 5 uint32_t elements.
 *
 * @param numBlocks (input) Number of 64-byte data chunks.
 *
 * @param data (input) Array of size numBlocks*64 input bytes.
 */
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t numBlocks, const void *data);

/*! @abstract SHA-256 256-bit digest update for numBlocks chunks of 64-byte (512-bit) data.
 *
 * @discussion
 * This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (CRYPTO), and armv7 (NEON).
 *
 * @param state (input/output) Array of 8 uint32_t elements.
 *
 * @param numBlocks (input) Number of 64-byte data chunks.
 *
 * @param data (input) Array of size numBlocks*64 input bytes.
 */
void AccelerateCrypto_SHA256_compress(uint32_t *state, size_t numBlocks, const void *data);

#if defined(__arm64__)
void AccelerateCrypto_SHA256_compress_arm64neon(uint32_t *state, size_t numBlocks, const void *data);
#endif

/*! @abstract SHA-512 512-bit digest update for numBlocks chunks of 128-byte (1,024-bit) data.
 *
 * @discussion
 * This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (NEON), and armv7 (NEON).
 *
 * @param state (input/output) Array of 8 uint64_t elements.
 *
 * @param numBlocks (input) Number of 128-byte data chunks.
 *
 * @param data (input) Array of size numBlocks*128 input bytes.
 */
void AccelerateCrypto_SHA512_compress(uint64_t *state, size_t numBlocks, const void *data);

#if defined(__arm64__)
void AccelerateCrypto_SHA512_compress_hwassist(uint64_t *state, size_t numBlocks, const void *data);
#endif

/* AES expanded key context */
#define KS_LENGTH 60
typedef struct
{   uint32_t ks[KS_LENGTH]; // maximum expanded key length = (14+1)*16 = 240 bytes = 60 uint32 words
    uint32_t rn;            // rn = 16*(10,12,14) for AES-128,192,256
} AccelerateCrypto_AES_ctx;


/*! @abstract AES function that encrypts a 16-byte input buffer into a 16-byte output buffer
 * according to a given expanded key context.
 *
 * @discussion
 * This routine is optimized for x86_64 (aesni), arm64 (CRYPTO), and armv7 (NEON).
 *
 * @param in (input) Array of 16-byte message.
 *
 * @param out (output) Array of 16-byte encrypted message.
 *
 * @param key (input) Expanded key context for encryption.
 *
 * @return 0 on success; otherwise a nonzero number indicating failure in the encrypt function.
 *
 */
int AccelerateCrypto_AES_encrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);

/*! @abstract AES function that decrypts a 16-byte input buffer into a 16-byte output buffer
 * according to a given expanded key context.
 *
 * @discussion
 * This routine is optimized for x86_64 (aesni), arm64 (CRYPTO), and armv7 (NEON).
 *
 * @param in (input) Array of 16-byte encrypted message.
 *
 * @param out (output) Array of 16-byte decrypted message.
 *
 * @param key (input) Expanded key context for decryption.
 *
 * @return 0 on success; otherwise a nonzero number indicating failure in the decrypt function.
 *
 */
int AccelerateCrypto_AES_decrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);

#if defined(__arm64__)
int AccelerateCrypto_ecb_AES_encrypt(const AccelerateCrypto_AES_ctx *key, uint32_t nblocks, const void *in, void *out);
int AccelerateCrypto_ecb_AES_decrypt(const AccelerateCrypto_AES_ctx *key, uint32_t nblocks, const void *in, void *out);
#endif

#ifdef __cplusplus
}
#endif // __cplusplus

#endif /* AccelerateCrypto_h */
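A minimal caller sketch against the header above (illustrative only, not part of the commit): the SHA-256 initial state is the standard FIPS 180-4 IV, and message padding is assumed to be the caller's responsibility, since these entry points only run the compression function over whole 64-byte blocks. The AES part shows only the context shape; filling ks with a real expanded key schedule is assumed to happen in corecrypto's key-expansion code, which is not part of this header.

    #include <stdio.h>
    #include <string.h>
    #include "AccelerateCrypto.h"

    int main(void)
    {
        /* SHA-256 over one already-padded 64-byte block (all-zero placeholder data). */
        uint32_t state[8] = {
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
        };
        uint8_t block[64] = {0};
        AccelerateCrypto_SHA256_compress(state, 1, block);

        /* AES-128: rn = 16*Nr = 160; ks must hold a real expanded key schedule
           (the all-zero schedule below is a placeholder, not a valid key). */
        AccelerateCrypto_AES_ctx ctx;
        memset(&ctx, 0, sizeof(ctx));
        ctx.rn = 160;
        uint8_t pt[16] = {0}, ct[16];
        if (AccelerateCrypto_AES_encrypt(pt, ct, &ctx) != 0)
            fprintf(stderr, "AES encrypt reported failure\n");
        return 0;
    }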
@ -0,0 +1,167 @@
/* Copyright (c) (2013,2015,2016,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

// #include <Availability.h>
#include <sys/cdefs.h>

#if defined(__clang__) && ((defined(__apple_build_version__) && __apple_build_version__ > 5010000))
#define __USES_V_CRYPTO_INTRINSICS 1
#else
#define __USES_V_CRYPTO_INTRINSICS 0
#endif


// AES INSTRUCTIONS
// aese.16b v0, v1
// aesd.16b v0, v1
// aesmc.16b v0, v1
// aesimc.16b v0, v1

// SHA1 INTRINSICS
// sha1su0.4s v0, v1, v2
// sha1su1.4s v0, v1
// sha1c.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1m.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1p.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1h.4s v0, v1 // or s0, s1

// SHA256 INTRINSICS
// sha256su0.4s v0, v1
// sha256su1.4s v0, v1, v2
// sha256h.4s v0, v1, v2 // or q0, q1, v2.4s
// sha256h2.4s v0, v1, v2 // or q0, q1, v2.4s


#if __USES_V_CRYPTO_INTRINSICS == 1
.macro AESE
aese.16b v$0, v$1
.endm

.macro AESD
aesd.16b v$0, v$1
.endm

.macro AESMC
aesmc.16b v$0, v$1
.endm

.macro AESIMC
aesimc.16b v$0, v$1
.endm


#else

.macro AESE
aese q$0, q$1
.endm

.macro AESD
aesd q$0, q$1
.endm

.macro AESMC
aesmc q$0, q$1
.endm

.macro AESIMC
aesimc q$0, q$1
.endm

#endif

#if __USES_V_CRYPTO_INTRINSICS == 1

.macro SHA1SU0
sha1su0 v$0.4s, v$1.4s, v$2.4s
.endm

.macro SHA1SU1
sha1su1 v$0.4s, v$1.4s
.endm

.macro SHA1C
sha1c q$0, s$1, v$2.4s
.endm

.macro SHA1M
sha1m q$0, s$1, v$2.4s
.endm

.macro SHA1P
sha1p q$0, s$1, v$2.4s
.endm

.macro SHA1H
sha1h s$0, s$1
.endm

.macro SHA256SU0
sha256su0 v$0.4s, v$1.4s
.endm

.macro SHA256SU1
sha256su1 v$0.4s, v$1.4s, v$2.4s
.endm

.macro SHA256H
sha256h q$0, q$1, v$2.4s
.endm

.macro SHA256H2
sha256h2 q$0, q$1, v$2.4s
.endm

#else

.macro SHA1SU0
sha1su0 q$0, q$1, q$2
.endm

.macro SHA1SU1
sha1su1 q$0, q$1
.endm

.macro SHA1C
sha1c q$0, q$1, q$2
.endm

.macro SHA1M
sha1m q$0, q$1, q$2
.endm

.macro SHA1P
sha1p q$0, q$1, q$2
.endm

.macro SHA1H
sha1h q$0, q$1
.endm

.macro SHA256SU0
sha256su0 q$0, q$1
.endm

.macro SHA256SU1
sha256su1 q$0, q$1, q$2
.endm

.macro SHA256H
sha256h q$0, q$1, q$2
.endm

.macro SHA256H2
sha256h2 q$0, q$1, q$2
.endm

#endif
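// Usage sketch (added for illustration; not in the source): these macros let the
// rounds be written once and assemble under either syntax, e.g.
//
//      AESE    0, 1    // "aese.16b v0, v1" on newer assemblers, "aese q0, q1" on older ones
//      AESMC   0, 0    // MixColumns on the same state register
//      SHA256H 0, 1, 2 // "sha256h q0, q1, v2.4s" or "sha256h q0, q1, q2"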
@ -0,0 +1,66 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */


#if (defined(__x86_64__) || defined(__i386__))

#if BUILDKERNEL

#include <i386/cpuid.h>
#define HAS_AESNI() ((cpuid_features() & CPUID_FEATURE_AES) != 0)
#define HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
#define HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
#define HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
#define HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) != 0)

#elif (defined(__APPLE__) && defined(__MACH__) && (__has_include(<System/i386/cpu_capabilities.h>) || __has_include(<System/arm/cpu_capabilities.h>))) // XNU_KERNEL_AVAILABLE

#include <System/i386/cpu_capabilities.h>

extern int _cpu_capabilities;
#define HAS_AESNI() (_cpu_capabilities & kHasAES)
#define HAS_SupplementalSSE3() (_cpu_capabilities & kHasSupplementalSSE3)
#define HAS_AVX1() (_cpu_capabilities & kHasAVX1_0)
#define HAS_AVX2() (_cpu_capabilities & kHasAVX2_0)
#define HAS_AVX512_AND_IN_KERNEL() 0

#else

#if (defined(__AES__))
#define HAS_AESNI() __AES__
#else
#define HAS_AESNI() 0
#endif // defined(__AES__)

#if (defined(__SSSE3__))
#define HAS_SupplementalSSE3() __SSSE3__
#else
#define HAS_SupplementalSSE3() 0
#endif // defined(__SSSE3__)

#if (defined(__AVX__))
#define HAS_AVX1() __AVX__
#else
#define HAS_AVX1() 0
#endif // defined(__AVX__)

#if (defined(__AVX2__))
#define HAS_AVX2() __AVX2__
#else
#define HAS_AVX2() 0
#endif // defined(__AVX2__)

#define HAS_AVX512_AND_IN_KERNEL() 0

#endif

#endif // (defined(__x86_64__) || defined(__i386__))
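A plausible consumer of these macros, sketched in C (the concrete function names below are assumptions inferred from the assembly file names elsewhere in this commit, not confirmed interfaces; the sketch assumes the header above is in scope):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical entry points corresponding to sha256_compress_{avx2,ssse3_64}.s
       and a portable C fallback. */
    void AccelerateCrypto_SHA256_compress_avx2(uint32_t *state, size_t numBlocks, const void *data);
    void AccelerateCrypto_SHA256_compress_ssse3(uint32_t *state, size_t numBlocks, const void *data);
    void sha256_compress_c(uint32_t *state, size_t numBlocks, const void *data);

    void sha256_compress(uint32_t *state, size_t numBlocks, const void *data)
    {
        if (HAS_AVX2())
            AccelerateCrypto_SHA256_compress_avx2(state, numBlocks, data);
        else if (HAS_SupplementalSSE3())
            AccelerateCrypto_SHA256_compress_ssse3(state, numBlocks, data);
        else
            sha256_compress_c(state, numBlocks, data);
    }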
@ -0,0 +1,12 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#

This is a clone of AccelerateCrypto-2.
@ -0,0 +1,477 @@
# Copyright (c) (2011,2012,2013,2014,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm__)

#define S0 r0
#define S1 r1
#define S2 r2
#define S3 r3

#if Select == 0
#define Name _AccelerateCrypto_AES_encrypt // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _AccelerateCrypto_AES_decrypt // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select

#if defined(__ARM_NEON__) // vpaes uses NEON instructions
.extern _AccelerateCrypto_vpaes_encrypt
.extern _AccelerateCrypto_vpaes_decrypt
#endif

#define ExpandedKey r11
#define ExpandedKeyEnd lr
#define ContextKeyLength 240
#define t r12

.subsections_via_symbols
.text
.syntax unified
.p2align 2
.code 16
.thumb_func Name
.globl Name
Name:
#if defined(__ARM_NEON__) // if NEON is available, use the cache-attack-resilient vector-permute AES

#if Select == 0
b _AccelerateCrypto_vpaes_encrypt
#else
b _AccelerateCrypto_vpaes_decrypt
#endif

#else // __ARM_NEON__

// set up debug trace frame pointer
push {r7,lr}
mov r7, sp

// now set up the stack for the current function
push {r1,r4-r6,r8-r11}
sub sp, #(16+8) // make sp 16-byte aligned

// copy r0,r2 to r4,r11 to release r0,r2 (r1 is saved on the stack) for use as S0-S3
mov r4, r0
mov ExpandedKey, r2

// Get and check "key length" (the rn field at byte offset 240: 160/192/224 for AES-128/192/256).
ldr t, [ExpandedKey, #ContextKeyLength]
cmp t, #160
beq 2f
cmp t, #192
beq 2f
cmp t, #224
beq 2f
mov r0, #-1 // Return error.
b 9f
2:

#if (Select == 0)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKeyEnd, ExpandedKey, t
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKeyEnd, ExpandedKey
add ExpandedKey, t
#endif

/*
 We need these macros because ldmia $0, {$1-$4} raises a memory access fault
 when $0 is not word-aligned in Thumb state.
*/
.macro thumb2_ldmia
ldr $1, [$0, #0]
ldr $2, [$0, #4]
ldr $3, [$0, #8]
ldr $4, [$0, #12]
.endm

.macro thumb2_stmia
str $1, [$0, #0]
str $2, [$0, #4]
str $3, [$0, #8]
str $4, [$0, #12]
.endm

// Initialize State from input text.
// (plain ldmia would crash when the input pointer in r4 is not word-aligned)
thumb2_ldmia r4, S0, S1, S2, S3

// Add round key and save results.
thumb2_ldmia ExpandedKey, r4, r5, r8, r10
add ExpandedKey, #Increment

eor S0, r4
eor S1, r5
eor S2, r8
eor S3, r10

// Set up r6 = _AESEncryptTable or _AESDecryptTable.
// L_table1 holds L_Tab$non_lazy_ptr-(L_table0+4); pc reads as L_table0+4 at the
// mov below, so the final ldr fetches the table address from the non-lazy pointer.
ldr r6, L_table1
L_table0:
mov r12, pc
ldr r6, [r12, r6]

// save S0-S3 in the stack memory
stmia sp, {S0-S3}

// Use r9 = 0xff as a byte mask to extract a byte from a shifted word; uxtb has
// the same complexity but would limit the code to armv6 or above.
mov r9, #0xff

// Get round key.
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
add ExpandedKey, #Increment

// per-round operation

/*
 The following macro defines the per-round operation for AES.
 The state computed in the previous round is saved in sp[0:15],
 and r0-r3 have been initialized with the next expanded round key.
 The macro reads the 16 bytes in sp[0:15] and, for each byte, does a table lookup;
 the resulting 4-byte word is xor-ed into one of r0-r3.
 The final r0-r3 is the new AES state.
 r6 : points to the Main or Final table
 r9 : 0xff, used as a byte mask
*/
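/*
 * The same dataflow restated in C for reference (an illustrative sketch added
 * here, not part of the source; T stands for the 256-entry uint32 table that
 * r6 points to, and ror32 is a 32-bit rotate right):
 *
 *     // in[]  = the four state words saved at sp[0:15]
 *     // out[] = the four round-key words preloaded into S0-S3
 *     for (int i = 0; i < 4; i++) {
 *         uint32_t w = in[i];
 *         out[i]           ^= T[w & 0xff];
 *         out[(i + 3) & 3] ^= ror32(T[(w >>  8) & 0xff], 24);
 *         out[(i + 2) & 3] ^= ror32(T[(w >> 16) & 0xff], 16);
 *         out[(i + 1) & 3] ^= ror32(T[(w >> 24) & 0xff],  8);
 *     }
 *
 * (Indices shown for the encrypt direction, where P0-P3 alias S0-S3; for
 * decryption the P0-P3 permutation defined above reverses the byte routing.)
 */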
.macro aes_per_round

#if defined (__ARM_ARCH_7S__)
// better for Swift (and the old Cortex-A8)

// S0 process
ldr t, [sp, #0] // load 4 bytes for S0 process
and r4, r9, t // byte 0
and r5, r9, t, lsr #8 // byte 1
ldr r4, [r6, r4, lsl #2] // 1st table lookup
and r8, r9, t, lsr #16 // byte 2
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
and r10, r9, t, lsr #24 // byte 3
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldr r10, [r6, r10, lsl #2] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
ldr t, [sp, #4] // read word for next S1 process
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup

// S1 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S1, r4
ldr r10, [r6, r10, lsl #2]
eor P0, r5, ror #24
ldr t, [sp, #8]
eor S3, r8, ror #16
eor P2, r10, ror #8

// S2 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S2, r4
ldr r10, [r6, r10, lsl #2]
eor P1, r5, ror #24
ldr t, [sp, #12]
eor S0, r8, ror #16
eor P3, r10, ror #8

// S3 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S3, r4
ldr r10, [r6, r10, lsl #2]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8

#else

// better for Cortex-A7 and Cortex-A9

// S0 process
ldrb r4, [sp, #0] // byte 0
ldrb r5, [sp, #1] // byte 1
ldrb r8, [sp, #2] // byte 2
ldrb r10, [sp, #3] // byte 3
ldr r4, [r6, r4, lsl #2] // 1st table lookup
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldr r10, [r6, r10, lsl #2] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup

// S1 process
ldrb r4, [sp, #4] // byte 0
ldrb r5, [sp, #5] // byte 1
ldrb r8, [sp, #6] // byte 2
ldrb r10, [sp, #7] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S1, r4
ldr r10, [r6, r10, lsl #2]
eor P0, r5, ror #24
eor S3, r8, ror #16
eor P2, r10, ror #8

// S2 process
ldrb r4, [sp, #8] // byte 0
ldrb r5, [sp, #9] // byte 1
ldrb r8, [sp, #10] // byte 2
ldrb r10, [sp, #11] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S2, r4
ldr r10, [r6, r10, lsl #2]
eor P1, r5, ror #24
eor S0, r8, ror #16
eor P3, r10, ror #8

// S3 process
ldrb r4, [sp, #12] // byte 0
ldrb r5, [sp, #13] // byte 1
ldrb r8, [sp, #14] // byte 2
ldrb r10, [sp, #15] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S3, r4
ldr r10, [r6, r10, lsl #2]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8

#endif

.endm

.macro aes_last_round
#if defined (__ARM_ARCH_7S__)
// better for Swift (and the old Cortex-A8)

// S0 process
ldr t, [sp, #0] // load 4 bytes for S0 process
and r4, r9, t // byte 0
and r5, r9, t, lsr #8 // byte 1
ldrb r4, [r6, r4] // 1st table lookup
and r8, r9, t, lsr #16 // byte 2
ldrb r5, [r6, r5] // 2nd table lookup
and r10, r9, t, lsr #24 // byte 3
ldrb r8, [r6, r8] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldrb r10, [r6, r10] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
ldr t, [sp, #4] // read word for next S1 process
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup

// S1 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S1, r4
ldrb r10, [r6, r10]
eor P0, r5, ror #24
ldr t, [sp, #8]
eor S3, r8, ror #16
eor P2, r10, ror #8

// S2 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S2, r4
ldrb r10, [r6, r10]
eor P1, r5, ror #24
ldr t, [sp, #12]
eor S0, r8, ror #16
eor P3, r10, ror #8

// S3 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S3, r4
ldrb r10, [r6, r10]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8

#else
// better for Cortex-A7 and Cortex-A9

// S0 process
ldrb r4, [sp, #0] // byte 0
ldrb r5, [sp, #1] // byte 1
ldrb r8, [sp, #2] // byte 2
ldrb r10, [sp, #3] // byte 3
ldrb r4, [r6, r4] // 1st table lookup
ldrb r5, [r6, r5] // 2nd table lookup
ldrb r8, [r6, r8] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldrb r10, [r6, r10] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup

// S1 process
ldrb r4, [sp, #4] // byte 0
ldrb r5, [sp, #5] // byte 1
ldrb r8, [sp, #6] // byte 2
ldrb r10, [sp, #7] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S1, r4
ldrb r10, [r6, r10]
eor P0, r5, ror #24
eor S3, r8, ror #16
eor P2, r10, ror #8

// S2 process
ldrb r4, [sp, #8] // byte 0
ldrb r5, [sp, #9] // byte 1
ldrb r8, [sp, #10] // byte 2
ldrb r10, [sp, #11] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S2, r4
ldrb r10, [r6, r10]
eor P1, r5, ror #24
eor S0, r8, ror #16
eor P3, r10, ror #8

// S3 process
ldrb r4, [sp, #12] // byte 0
ldrb r5, [sp, #13] // byte 1
ldrb r8, [sp, #14] // byte 2
ldrb r10, [sp, #15] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S3, r4
ldrb r10, [r6, r10]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#endif

.endm

1:
aes_per_round

// Save state for next iteration and load next round key.
stmia sp,{S0-S3}
thumb2_ldmia ExpandedKey, S0, S1, S2, S3

cmp ExpandedKeyEnd, ExpandedKey
add ExpandedKey, #Increment
bne 1b

// set up r6 = _AESSubBytesWordTable or _AESInvSubBytesWordTable
ldr r6, L_table3
L_table2:
mov r12, pc
ldr r6, [r12, r6]

aes_last_round

ldr r4, [sp, #(16+8)] // restore OutputText
thumb2_stmia r4, S0, S1, S2, S3
eor r0, r0 // Return success.

9:

add sp, #(4+16+8) // skip r1 restore
pop {r4-r6,r8-r11}
pop {r7, pc}


.p2align 2
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+4)

.p2align 2
L_table3:
.long L_Tab$non_lazy_ptr2-(L_table2+4)

.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 2
L_Tab$non_lazy_ptr:
.indirect_symbol MTable
.long 0

.p2align 2
L_Tab$non_lazy_ptr2:
.indirect_symbol FTable
.long 0

#endif // __ARM_NEON__

#undef S0
#undef S1
#undef S2
#undef S3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment

#endif /* defined(__arm__) */
@ -0,0 +1,13 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#define Select 1
#include "EncryptDecrypt.s"
#undef Select
@ -0,0 +1,13 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#define Select 0
#include "EncryptDecrypt.s"
#undef Select
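The two wrappers above are the whole per-direction build mechanism: each sets Select and re-includes the shared body, which derives the routine name, lookup tables, state permutation, and key-walk direction from it. The shape of the pattern, for reference:

    // encrypt.s:  #define Select 0  -> emits _AccelerateCrypto_AES_encrypt
    //             (encrypt tables, forward walk through the expanded key)
    // decrypt.s:  #define Select 1  -> emits _AccelerateCrypto_AES_decrypt
    //             (decrypt tables, backward walk through the expanded key)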
@ -0,0 +1,751 @@
# Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.


#if !defined(__arm64__) && defined(__ARM_NEON__)

#define ekey r2
#define eax r4

.macro save_all_neon
#if BUILDKERNEL
vstmdb sp!, {q12-q15}
vstmdb sp!, {q8-q11}
vstmdb sp!, {q0-q3}
#endif
vstmdb sp!, {q4-q7}
.endm

.macro restore_all_neon
vldmia sp!, {q4-q7}
#if BUILDKERNEL
vldmia sp!, {q0-q3}
vldmia sp!, {q8-q11}
vldmia sp!, {q12-q15}
#endif
.endm

.macro vpaes_push
push {r4-r7,lr}
add r7, sp, #12
push {r8,r10,r11}
.endm

.macro vpaes_pop
pop {r8,r10,r11}
pop {r4-r7,pc}
.endm

.p2align 6
.Lk_ipt:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

.Lk_sbo:
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA

.Lk_mc_forward:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605

.Lk_mc_backward:
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F

.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508


.p2align 4
vpaes_encrypt_core:
mov r9, ekey
mov r11, #16
adr r10, .Lk_ipt
ldr eax, [ekey, #240]
vldmia r10!,{q3-q4}
vbic q1, q0, q9
vld1.8 {q5}, [r9]!
vshr.u32 q1, q1, #4
vand q0, q0, q9

vtbl.8 d4, {q3}, d0
vtbl.8 d5, {q3}, d1

adr r10, .Lk_mc_backward

vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
veor q2, q2, q5
veor q0, q0, q2
cmp eax, #0
b .Lenc_entry

.p2align 4
.Lenc_loop:

vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q4, q4, q5
add r12, r10, r11
veor q5, q0, q4
vld1.8 {q4}, [r12 :128]
sub r12, r12, #64
vtbl.8 d12, {q15}, d4
vtbl.8 d13, {q15}, d5
vld1.8 {q1}, [r12 :128]

vtbl.8 d4, {q14}, d6
vtbl.8 d5, {q14}, d7

veor q2, q2, q6

vtbl.8 d6, {q5}, d8
vtbl.8 d7, {q5}, d9
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
veor q5, q0, q2

add r11, r11, #16
veor q3, q3, q5
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
and r11, r11, #48
subs eax, eax, #1
veor q0, q0, q3

.Lenc_entry:

vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4

vtbl.8 d10, {q11}, d0
vtbl.8 d11, {q11}, d1

veor q0, q0, q1

vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1

veor q3, q3, q5
veor q4, q4, q5

vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9

veor q2, q2, q0
veor q3, q3, q1

vld1.8 {q5}, [r9]!
bgt .Lenc_loop

adr r12, .Lk_sbo

vld1.8 {q1}, [r12]!
vtbl.8 d8, {q1}, d4
vtbl.8 d9, {q1}, d5
vld1.8 {q2}, [r12]
add r12, r10, r11
veor q4, q4, q5
add r12, r12, #64
vtbl.8 d0, {q2}, d6
vtbl.8 d1, {q2}, d7
vld1.8 {q1}, [r12]
veor q2, q0, q4
vtbl.8 d0, {q2}, d2
vtbl.8 d1, {q2}, d3
bx lr


.p2align 4
.Lk_dipt:
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.quad 0x000302010C0F0E0D, 0x080B0A0904070605 // .Lk_mc_forward+48

.Lk_dsb9:
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C

.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x0F060D040B020900, 0x070E050C030A0108


.p2align 4
vpaes_decrypt_core:
mov r9, r2 // dkey
ldr eax, [r2, #240] // Nr
adr r12, .Lk_dipt
vbic q1, q0, q9
vld1.64 {q3}, [r12 :128]!
vshr.u32 q1, q1, #4
vld1.8 {q5}, [r9]!
lsl r11, eax, #4
vand q2, q0, q9
vtbl.8 d4, {q3}, d4
vtbl.8 d5, {q3}, d5
vld1.64 {q4}, [r12 :128]!
eor r11, r11, #48
adr r10, .Lk_dsbd
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
and r11, r11, #48
veor q2, q2, q5
vld1.64 {q5}, [r12 :128]!
veor q0, q0, q2
cmp eax, #0
b .Ldec_entry

.p2align 4
.Ldec_loop:

sub r12, r10, 32
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!

veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!

veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11

veor q0, q0, q4

vext.8 q5, q5, q5, #12
veor q0, q0, q1
subs eax, eax, #1

.Ldec_entry:

vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1

veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3

veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1

veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7

veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9

vld1.8 {q0}, [r9]!
veor q3, q3, q1
bne .Ldec_loop

vld1.64 {q6-q7}, [r12 :128]!

vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
add r12, r12, r11, lsr #1
vtbl.8 d6, {q7}, d6
vtbl.8 d7, {q7}, d7
vld1.64 {q2}, [r12]
veor q0, q0, q4
veor q1, q0, q3

vtbl.8 d0, {q1}, d4
vtbl.8 d1, {q1}, d5
bx lr

.p2align 6
.Lk_ipt2:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_rcon:
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_sr:
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508


.p2align 4
vpaes_schedule_core:
bl vpaes_preheat
adr r10, .Lk_rcon
vld1.8 {q0}, [r0]
vld1.64 {q8}, [r10 :128]!
vmov q3, q0
adr r11, .Lk_ipt2
bl vpaes_schedule_transform
vmov q7, q0

cmp r3, #0
bne .Lschedule_am_decrypting

vst1.8 {q0}, [r2]

b .Lschedule_go

.Lschedule_am_decrypting:

add r12, r10, r8
vmov q1, q3
vld1.8 {q3}, [r12]
vtbl.8 d6, {q1}, d6
vtbl.8 d7, {q1}, d7
eor r8, r8, #48
vst1.8 {q3}, [r2]


.Lschedule_go:
cmp r1, #192
bgt .Lschedule_256
beq .Lschedule_192

.Lschedule_128:
mov r1, #10

.Loop_schedule_128:
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
b .Loop_schedule_128

.p2align 4
.Lschedule_192:
add r12, r0, #8
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
vmov d13, d1
veor d12, d12, d12
mov r1, #4

.Loop_schedule_192:
bl vpaes_schedule_round
vext.8 q0, q6, q0, #8

bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
bl vpaes_schedule_mangle
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
b .Loop_schedule_192

.p2align 4
.Lschedule_256:
add r12, r0, #16
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
mov r1, #7

.Loop_schedule_256:
bl vpaes_schedule_mangle
vmov q6, q0

bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle

vdup.32 q0, d1[1]
vmov q5, q7
vmov q7, q6
bl vpaes_schedule_low_round
vmov q7, q5

b .Loop_schedule_256

.p2align 4
.Lk_opt:
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0

.Lk_deskew:
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.p2align 4
.Lschedule_mangle_last:

adr r11, .Lk_deskew
cmp r3, #0
bne .Lschedule_mangle_last_dec

add r12, r8, r10
vld1.8 {q1}, [r12]
adr r11, .Lk_opt
vtbl.8 d2, {q0}, d2
vtbl.8 d3, {q0}, d3
vmov q0, q1
add r2, r2, #32

.Lschedule_mangle_last_dec:
adr r12, .Lk_s63
sub r2, r2, #16
vld1.8 {q1}, [r12]
veor q0, q0, q1
bl vpaes_schedule_transform
vst1.8 {q0}, [r2]

restore_all_neon

eor r0, r0, r0
vpaes_pop


.p2align 4
vpaes_schedule_192_smear:
vdup.32 q1, d12[0]
vdup.32 q0, d15[1]
vmov s7, s26
vmov s0, s30
veor q6, q6, q1
veor q6, q6, q0
vmov q0, q6
veor d12, d12, d12
bx lr


.p2align 4
vpaes_schedule_round:

veor q1, q1, q1
vext.8 q1, q8, q1, #15
vext.8 q8, q8, q8, #15
veor q7, q7, q1
vdup.32 q0, d1[1]
vext.8 q0, q0, q0, #1
|
||||
|
||||
vpaes_schedule_low_round:
|
||||
|
||||
veor q1, q1, q1
|
||||
adr r12, .Lk_s63
|
||||
vext.8 q1, q1, q7, #12
|
||||
veor q2, q2, q2
|
||||
veor q7, q7, q1
|
||||
vld1.8 {q1}, [r12]
|
||||
vext.8 q2, q2, q7, #8
|
||||
veor q7, q7, q1
|
||||
veor q7, q7, q2
|
||||
|
||||
|
||||
vbic q1, q0, q9
|
||||
vshr.u32 q1, q1, #4
|
||||
vand q0, q0, q9
|
||||
|
||||
vtbl.8 d4, {q11}, d0
|
||||
vtbl.8 d5, {q11}, d1
|
||||
|
||||
veor q0, q0, q1
|
||||
|
||||
vtbl.8 d6, {q10}, d2
|
||||
vtbl.8 d7, {q10}, d3
|
||||
|
||||
veor q3, q3, q2
|
||||
|
||||
vtbl.8 d8, {q10}, d0
|
||||
vtbl.8 d9, {q10}, d1
|
||||
|
||||
veor q4, q4, q2
|
||||
|
||||
vtbl.8 d4, {q10}, d6
|
||||
vtbl.8 d5, {q10}, d7
|
||||
|
||||
veor q2, q2, q0
|
||||
|
||||
|
||||
vtbl.8 d6, {q10}, d8
|
||||
vtbl.8 d7, {q10}, d9
|
||||
|
||||
veor q3, q3, q1
|
||||
|
||||
vtbl.8 d8, {q13}, d4
|
||||
vtbl.8 d9, {q13}, d5
|
||||
|
||||
vtbl.8 d0, {q12}, d6
|
||||
vtbl.8 d1, {q12}, d7
|
||||
|
||||
veor q0, q0, q4
|
||||
veor q0, q0, q7
|
||||
vmov q7, q0
|
||||
|
||||
bx lr
|
||||
|
||||
.p2align 4
|
||||
vpaes_schedule_transform:
|
||||
vbic q1, q0, q9
|
||||
vldmia r11, {q4-q5}
|
||||
vand q0, q0, q9
|
||||
vshr.u32 q1, q1, #4
|
||||
vtbl.8 d0, {q4}, d0
|
||||
vtbl.8 d1, {q4}, d1
|
||||
vtbl.8 d2, {q5}, d2
|
||||
vtbl.8 d3, {q5}, d3
|
||||
veor q0, q0, q1
|
||||
bx lr
|
||||
|
||||
|
||||
.p2align 4
|
||||
.Lk_mc_forward2:
|
||||
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
|
||||
.Lk_s63:
|
||||
.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
|
||||
|
||||
.Lk_dksd:
|
||||
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
|
||||
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
|
||||
.Lk_dksb:
|
||||
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
|
||||
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
|
||||
.Lk_dkse:
|
||||
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
|
||||
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
|
||||
.Lk_dks9:
|
||||
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
|
||||
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
|
||||
|
||||
.p2align 4
|
||||
vpaes_schedule_mangle:
|
||||
vstmdb sp!, {q6-q7}
|
||||
adr r12, .Lk_mc_forward2
|
||||
vmov q4, q0
|
||||
cmp r3, #0
|
||||
vldmia r12!, {q5-q6} // q5 = Lk_mc_forward2, q6 = Lk_s63
|
||||
bne .Lschedule_mangle_dec
|
||||
add r2, r2, #16
|
||||
veor q4, q4, q6
|
||||
|
||||
vtbl.8 d6, {q4}, d10
|
||||
vtbl.8 d7, {q4}, d11
|
||||
vtbl.8 d8, {q3}, d10
|
||||
vtbl.8 d9, {q3}, d11
|
||||
vtbl.8 d2, {q4}, d10
|
||||
vtbl.8 d3, {q4}, d11
|
||||
veor q3, q3, q4
|
||||
veor q3, q3, q1
|
||||
b .Lschedule_mangle_both
|
||||
|
||||
.p2align 4
|
||||
.Lschedule_mangle_dec:
|
||||
|
||||
vbic q1, q4, q9
|
||||
vldmia r12!, {q6-q7}
|
||||
vshr.u32 q1, q1, #4
|
||||
vand q4, q4, q9
|
||||
|
||||
vtbl.8 d4, {q6}, d8
|
||||
vtbl.8 d5, {q6}, d9
|
||||
vtbl.8 d6, {q7}, d2
|
||||
vtbl.8 d7, {q7}, d3
|
||||
vldmia r12!, {q6-q7}
|
||||
veor q2, q3, q2
|
||||
vtbl.8 d6, {q2}, d10
|
||||
vtbl.8 d7, {q2}, d11
|
||||
|
||||
|
||||
vtbl.8 d4, {q6}, d8
|
||||
vtbl.8 d5, {q6}, d9
|
||||
veor q2, q2, q3
|
||||
vtbl.8 d6, {q7}, d2
|
||||
vtbl.8 d7, {q7}, d3
|
||||
vldmia r12!, {q6-q7}
|
||||
veor q2, q3, q2
|
||||
vtbl.8 d6, {q2}, d10
|
||||
vtbl.8 d7, {q2}, d11
|
||||
|
||||
vtbl.8 d4, {q6}, d8
|
||||
vtbl.8 d5, {q6}, d9
|
||||
veor q2, q2, q3
|
||||
vtbl.8 d6, {q7}, d2
|
||||
vtbl.8 d7, {q7}, d3
|
||||
vldmia r12!, {q6-q7}
|
||||
veor q2, q3, q2
|
||||
vtbl.8 d6, {q2}, d10
|
||||
vtbl.8 d7, {q2}, d11
|
||||
|
||||
vtbl.8 d4, {q6}, d8
|
||||
vtbl.8 d5, {q6}, d9
|
||||
veor q2, q2, q3
|
||||
vtbl.8 d6, {q7}, d2
|
||||
vtbl.8 d7, {q7}, d3
|
||||
veor q3, q3, q2
|
||||
|
||||
sub r2, r2, #16
|
||||
|
||||
.Lschedule_mangle_both:
|
||||
add r12, r10, r8
|
||||
vld1.8 {q1}, [r12]
|
||||
sub r8, r8, #16
|
||||
vtbl.8 d4, {q3}, d2
|
||||
vtbl.8 d5, {q3}, d3
|
||||
and r8, r8, #48
|
||||
vst1.8 {q2}, [r2]
|
||||
vldmia sp!, {q6-q7}
|
||||
bx lr
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, void *key);
|
||||
*/
|
||||
|
||||
#define userKey r0
|
||||
#define AES_bits r1
|
||||
#define key r2
|
||||
#define t r12
|
||||
.globl _AccelerateCrypto_vpaes_set_encrypt_key
|
||||
.p2align 4
|
||||
_AccelerateCrypto_vpaes_set_encrypt_key:
|
||||
|
||||
|
||||
// 128/192/256 divided by 32 gives 4/6/8; adding 5 gives the 9/11/13 stored below
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
add t, r11, #5
mov r3, #0
str t, [key, #240]
mov r8, #48
b vpaes_schedule_core

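As a sketch of the round-count arithmetic above in C (hypothetical helper name, not part of this source):

    /* bits >> 5 gives 4/6/8 for AES-128/192/256; adding 5 gives the
       9/11/13 that vpaes stores at [key, #240]. Illustrative only. */
    static inline unsigned vpaes_stored_rounds(unsigned bits)
    {
        return (bits >> 5) + 5;
    }
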
.globl _AccelerateCrypto_vpaes_set_decrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_decrypt_key:
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
mov r8, #32
add t, r11, #5
and r8, r8, AES_bits, lsr #1
mov r3, #1
str t, [key, #240]
add key, key, #16
eor r8, r8, #32
add key, key, t, lsl #4
b vpaes_schedule_core

/*
void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
*/
#define in r0
#define out r1
#define key r2

.globl _AccelerateCrypto_vpaes_encrypt
.p2align 4
_AccelerateCrypto_vpaes_encrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_encrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0 // return 0 for SUCCESS
vpaes_pop

.globl _AccelerateCrypto_vpaes_decrypt
.p2align 4
_AccelerateCrypto_vpaes_decrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_decrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0 // return 0 for SUCCESS
vpaes_pop

.p2align 4
vpaes_preheat:
adr r12, .Lk_s0F
vldmia r12, {q9-q15}
bx lr

.p2align 6
// the following seven 16-byte constants are loaded into q9-q15 by vpaes_preheat
.Lk_s0F:
.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
.Lk_inv:
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_sb1:
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

#endif // !defined(__arm64__) && defined(__ARM_NEON__)

@ -0,0 +1,65 @@
# Copyright (c) (2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// per block
#define in x0
#define out x1
#define key x2
#define keylen x3
#define t x5

.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_AES_decrypt
_AccelerateCrypto_AES_decrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub sp, sp, #3*16
st1.4s {v0,v1,v2}, [sp]
#endif

ldr w3, [key, #240] // keylength = 32-bit
ldr q0, [in] // plain data
mov t, keylen
ldr q1, [key, t] // expanded key
sub t, t, #16
ldr q2, [key] // expanded key
0:
AESD 0, 1
AESIMC 0, 0
ldr q1, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 1
eor.16b v0, v0, v2
str q0, [out]

#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2}, [sp], #48
#endif

mov x0, #0
ret lr

#undef in
#undef out
#undef key
#undef keylen

#endif
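
The round structure above (AESD+AESIMC per middle round, then a final AESD plus an EOR with the first round key) can be sketched with the corresponding C intrinsics. This is only a model of the loop, not code from this source; names (rk, nr) are illustrative, and the crypto extension must be enabled (e.g. -march=armv8-a+crypto):

    /* Sketch of the decrypt round structure above (not from this source).
       rk[0..nr] are the expanded round keys; the loop walks them from
       the end of the schedule back to rk[1], and the first round key is
       applied with a plain XOR at the end, as in the assembly. */
    #include <arm_neon.h>

    static uint8x16_t aes_decrypt_block(uint8x16_t block,
                                        const uint8x16_t *rk, int nr)
    {
        for (int i = nr; i > 1; i--) {
            block = vaesdq_u8(block, rk[i]);  // AddRoundKey + InvShiftRows + InvSubBytes
            block = vaesimcq_u8(block);       // InvMixColumns
        }
        block = vaesdq_u8(block, rk[1]);      // last round: no InvMixColumns
        return veorq_u8(block, rk[0]);        // final AddRoundKey
    }
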
@ -0,0 +1,114 @@
# Copyright (c) (2011-2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// ecb mode

#define key x0
#define nblocks w1
#define in x2
#define out x3
#define keylen x4
#define t x5

.subsections_via_symbols
.text

.globl _AccelerateCrypto_ecb_AES_decrypt
.p2align 4
_AccelerateCrypto_ecb_AES_decrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub x4, sp, #6*16
sub sp, sp, #6*16
st1.4s {v0,v1,v2,v3}, [x4], #4*16
st1.4s {v4,v5}, [x4], #2*16
#endif

ldr w4, [key, #240] // keylength = 32-bit
ldr q5, [key] // expanded key
subs nblocks, nblocks, #4
b.lt L_lessthan4

L_4blocks:
mov t, keylen
ld1.4s {v0,v1,v2,v3}, [in], #4*16
ldr q4, [key, t] // expanded key
sub t, t, #16
0:
AESD 0, 4
AESIMC 0, 0
AESD 1, 4
AESIMC 1, 1
AESD 2, 4
AESIMC 2, 2
AESD 3, 4
AESIMC 3, 3
ldr q4, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 4
eor.16b v0, v0, v5
AESD 1, 4
eor.16b v1, v1, v5
AESD 2, 4
eor.16b v2, v2, v5
AESD 3, 4
eor.16b v3, v3, v5

st1.4s {v0,v1,v2,v3}, [out], #4*16

subs nblocks, nblocks, #4
b.ge L_4blocks

L_lessthan4:
ands nblocks, nblocks, #3
b.eq 9f

L_1block:
mov t, keylen
ldr q0, [in], #16 // plain data
ldr q4, [key, t] // expanded key
sub t, t, #16
0:
AESD 0, 4
AESIMC 0, 0
ldr q4, [key, t] // expanded key
subs t, t, #16
b.gt 0b

AESD 0, 4
eor.16b v0, v0, v5

str q0, [out], #16
subs nblocks, nblocks, #1
b.gt L_1block

9:
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2,v3}, [sp], #4*16
ld1.4s {v4,v5}, [sp], #2*16
#endif

mov x0, #0
ret lr

#undef in
#undef out
#undef key
#undef nblocks
#undef keylen

#endif
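
Processing four blocks per iteration gives four independent AESD/AESIMC dependency chains that can overlap in the pipeline, which is what the v0-v3 interleaving above is for. A hedged C sketch of the same idea, under the same assumptions as the single-block sketch earlier (illustrative names, not this file's code):

    /* 4-way interleaved ECB decryption sketch (not from this source).
       Each block's round chain is independent, so issuing them round by
       round hides the AESD/AESIMC instruction latency. */
    #include <arm_neon.h>

    static void aes_ecb_decrypt4(uint8x16_t b[4], const uint8x16_t *rk, int nr)
    {
        for (int i = nr; i > 1; i--) {
            for (int j = 0; j < 4; j++) {
                b[j] = vaesdq_u8(b[j], rk[i]);
                b[j] = vaesimcq_u8(b[j]);
            }
        }
        for (int j = 0; j < 4; j++)
            b[j] = veorq_u8(vaesdq_u8(b[j], rk[1]), rk[0]);
    }
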
@ -0,0 +1,66 @@
# Copyright (c) (2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// per block implementation

#define in x0
#define out x1
#define key x2
#define keylen x3
#define t x5

.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_AES_encrypt
_AccelerateCrypto_AES_encrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub sp, sp, #3*16
st1.4s {v0,v1,v2}, [sp]
#endif

ldr w3, [key, #240] // keylength = 32-bit, 160/192/224
ldr q0, [in] // plain data
ldr q1, [key] // expanded key
ldr q2, [key, keylen] // final expanded key
mov t, #16
0:
AESE 0, 1
AESMC 0, 0
ldr q1, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b

AESE 0, 1
eor.16b v0, v0, v2

str q0, [out]

#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2}, [sp], #48
#endif

mov x0, #0
ret lr

#undef in
#undef out
#undef key
#undef keylen

#endif
@ -0,0 +1,119 @@
# Copyright (c) (2011-2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
#define key x0
#define nblocks w1
#define in x2
#define out x3
#define keylen x4
#define t x5

.subsections_via_symbols
.text

.p2align 4
.globl _AccelerateCrypto_ecb_AES_encrypt
_AccelerateCrypto_ecb_AES_encrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub x4, sp, #6*16
sub sp, sp, #6*16
st1.4s {v0,v1,v2,v3}, [x4], #4*16
st1.4s {v4,v5}, [x4], #2*16
#endif

ldr w4, [key, #240] // keylength = 32-bit, 160/192/224
subs nblocks, nblocks, #4 // pre-decrement nblocks by 4
ldr q5, [key, keylen] // expanded key
b.lt 1f // if nblocks < 4, go to scalar loop

L_4blocks:

// handle 4 blocks per iteration
ldr q4, [key] // expanded key
mov t, #16
ld1.4s {v0,v1,v2,v3}, [in], #4*16
0:
AESE 0, 4
AESMC 0, 0
AESE 1, 4
AESMC 1, 1
AESE 2, 4
AESMC 2, 2
AESE 3, 4
AESMC 3, 3
ldr q4, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b

AESE 0, 4
eor.16b v0, v0, v5
AESE 1, 4
eor.16b v1, v1, v5
AESE 2, 4
eor.16b v2, v2, v5
AESE 3, 4
eor.16b v3, v3, v5

st1.4s {v0,v1,v2,v3}, [out], #4*16
subs nblocks, nblocks, #4
b.ge L_4blocks

1: // handle 1 block per iteration
ands nblocks, nblocks, #3
b.eq 9f

L_1block:
ldr q4, [key] // expanded key
mov t, #16
ldr q0, [in], #16 // plain data
0:
AESE 0, 4
AESMC 0, 0
ldr q4, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b

AESE 0, 4
eor.16b v0, v0, v5

str q0, [out], #16

subs nblocks, nblocks, #1
b.gt L_1block

9:
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2,v3}, [sp], #4*16
ld1.4s {v4,v5}, [sp], #2*16
#endif

mov x0, #0
ret lr

#undef in
#undef out
#undef key
#undef nblocks
#undef keylen

#endif
@ -0,0 +1,25 @@
/* Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_CONTEXT_H_
#define _CORECRYPTO_CONTEXT_H_

// Define byte offset of key within context structure.
#define ContextKey 0

/* Define byte offset of key length within context structure. The number
stored there is the number of bytes from the start of the first round key
to the start of the last round key. That is 16 less than the number of
bytes in the entire key.
*/
#define ContextKeyLength 240

#endif /* _CORECRYPTO_CONTEXT_H_ */
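
In other words, an Nr-round schedule holds Nr+1 16-byte round keys, so the value stored at offset 240 is 16*Nr: 160, 192, or 224 for AES-128/192/256. A small illustrative reader (hypothetical helper, not part of this header):

    /* Illustrative use of the offsets above (sketch, not from this source). */
    #include <stdint.h>
    #include <string.h>

    #define ContextKey       0
    #define ContextKeyLength 240

    static uint32_t context_key_length(const uint8_t *ctx)
    {
        uint32_t len;
        /* 16 * Nr: 160, 192, or 224 for AES-128/192/256 */
        memcpy(&len, ctx + ContextKeyLength, sizeof(len));
        return len;
    }
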
File diff suppressed because it is too large
@ -0,0 +1,576 @@
# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__i386__) || defined(__x86_64__)

/* This file defines _vng_aes_encrypt or _vng_aes_decrypt, according to the value of
the Select preprocessor symbol. This file is designed to be included in
another assembly file using the preprocessor #include directive, to benefit
from some assembly-time calculations.

These two routines are nearly identical. They differ only in the tables
they use, the direction they iterate through the key, and the permutation
performed on part of the state.

Written by Eric Postpischil, January 2008.
*/

#if Select == 0
#define Name _aes_encrypt_nonaesni // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _aes_decrypt_nonaesni // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select

/* Routine:

_AESEncryptWithExpandedKey (if Select is 0) or
_AESDecryptWithExpandedKey (if Select is 1).

Function:

Perform the AES cipher or its inverse as defined in Federal Information
Processing Standards Publication 197 (FIPS-197), November 26, 2001.

The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.

Input:

Constant data:

The following names must be locally defined so the assembler
can calculate certain offsets.

For encryption:

static const Word _AESEncryptTable[4][256].

_AESEncryptTable[i] contains the tables T[i] defined in AES
Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
Vincent Rijmen, section 5.2.1, page 18. These tables
combine the SubBytes and MixColumns operations.

static const Word _AESSubBytesWordTable[4][256].

_AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
SubBytes is defined in FIPS-197. _AESSubBytesWordTable
differs from _AESEncryptTable in that it does not include
the MixColumn operation. It is used in performing the last
round, which differs from the previous rounds in that it
does not include the MixColumn operation.

For decryption:

static const Word _AESDecryptTable[4][256].

The analog of _AESEncryptTable for decryption.

static const Word _AESInvSubBytesWordTable[4][256].

_AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
where InvSubBytes is defined in FIPS-197.
_AESInvSubBytesWordTable differs from _AESDecryptTable in
that it does not include the InvMixColumn operation. It is
used in performing the last round, which differs from the
previous rounds in that it does not include the
InvMixColumn operation.

Arguments:

const Byte *InputText.

Address of input, 16 bytes. Best if four-byte aligned.

Byte *OutputText.

Address of output, 16 bytes. Best if four-byte aligned.

vng_aes_encrypt_ctx *Context or vng_aes_decrypt_ctx *Context

vng_aes_encrypt_ctx and vng_aes_decrypt_ctx are identical except the
former is used for encryption and the latter for decryption.

Each is a structure containing the expanded key beginning at
offset ContextKey and a four-byte "key length" beginning at
offset ContextKeyLength. The "key length" is the number of
bytes from the start of the first round key to the start of the
last round key. That is 16 less than the number of bytes in
the entire key.

Output:

Encrypted or decrypted data is written to *OutputText.

Return:

aes_rval // -1 if "key length" is invalid. 0 otherwise.
*/
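
As a model of what one main-table round computes, the sketch below uses C; it is not code from this source, and the names Te, rk, s, t are illustrative (Te stands in for _AESEncryptTable and rk for one 4-word round key):

    /* One table-driven encryption round, assuming Te[4][256] combines
       SubBytes and MixColumns as described above. Sketch only. */
    #include <stdint.h>

    extern const uint32_t Te[4][256];

    static void aes_table_round(uint32_t t[4], const uint32_t s[4],
                                const uint32_t rk[4])
    {
        for (int i = 0; i < 4; i++) {
            t[i] = Te[0][ s[i]             & 0xff] ^
                   Te[1][(s[(i + 1) & 3] >>  8) & 0xff] ^
                   Te[2][(s[(i + 2) & 3] >> 16) & 0xff] ^
                   Te[3][(s[(i + 3) & 3] >> 24) & 0xff] ^
                   rk[i];
        }
    }

The assembly below does exactly this byte dissection with movzb on %al/%ah and the Lookup macros, holding spilled state words in XMM registers.
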

.text
.globl Name
Name:

// Jimmur removed the capabilities check and the jump to the AESNI code. This
// will be handled by the C code.

// Push new stack frame.
push r5

/* Save registers and set SaveSize to the number of bytes pushed onto the
stack so far, including the caller's return address.
*/
push r3
#if defined __i386__
push r6
push r7
#define SaveSize (5*4)
#else
#define SaveSize (3*8)
#endif

/* Number of bytes used for local variables:

4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd.

5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers.
*/
#define LocalsSize (Arch(4, 0) + Arch(5, 3)*16)

#if 0 < LocalsSize
// Padding to position stack pointer at a multiple of 16 bytes.
#define Padding (15 & -(SaveSize + LocalsSize))
sub $Padding + LocalsSize, r4 // Allocate space on stack.
#else
#define Padding 0
#endif

#if BUILDKERNEL

// Save XMM registers.
movaps %xmm0, 0*16(r4)
movaps %xmm1, 1*16(r4)
movaps %xmm2, 2*16(r4)

#if defined __i386__
movaps %xmm3, 3*16(r4)
movaps %xmm4, 4*16(r4)
#endif

#endif // BUILDKERNEL

#if defined __i386__

// Number of bytes from caller's stack pointer to ours.
#define StackFrame (SaveSize + Padding + LocalsSize)

// Define location of argument i (presuming 4-byte arguments).
#define Argument(i) StackFrame+4*(i)(%esp)

#define ArgInputText Argument(0)
#define ArgOutputText Argument(1)
#define ArgContext Argument(2)

#elif defined __x86_64__

// Arguments.
#define InputText r7 // Used early then overwritten for other use.
#define OutputText r6 // Needed near end of routine.
#define ArgContext r2
/* The argument passed in r2 overlaps registers we need for other
work, so it must be moved early in the routine.
*/

#endif

#define BaseP Arch(r6, r9) // Base pointer for addressing global data.
#define ExpandedKey Arch(t0, r10) // Address of expanded key.

/* The Work registers defined below are used to hold parts of the AES state
while we dissect or assemble it. They must be assigned to the A, B, C, and
D registers so that we can access the bytes in %al, %ah, and so on.
*/
#define Work0d r0d
#define Work0l r0l
#define Work0h r0h
#define Work1d r3d
#define Work1l r3l
#define Work1h r3h
#define Work2d r1d
#define Work2l r1l
#define Work2h r1h
#define Work3d r2d
#define Work3l r2l
#define Work3h r2h

#define t0 r5
#define t0d r5d // Low 32 bits of t0.
#define t0l r5l // Low byte of t0.

#define t1 r7

/* S0, S1, S2, and S3 are where we assemble the new AES state when computing
a regular round. S1, S2, and S3 are assigned to the Work registers, but
S0 needs to go somewhere else because Work0 holds part of the old state.
*/
#define S0 Arch(t1, r8d)
#define S1 Work1d
#define S2 Work2d
#define S3 Work3d

/* These XMM registers are used as holding space, because it is faster to
spill to these registers than to the stack. (On x86_64, we do not need
to spill, because there are additional general registers available.
However, using more general registers requires saving them to the stack
and restoring them. I timed it, and no time was saved.)
*/
#define vS1 %xmm0
#define vS2 %xmm1
#define vS3 %xmm2
#if defined __i386__
#define vExpandedKey %xmm3
#define vIncrement %xmm4
#endif

// Get address of expanded key.
mov ArgContext, ExpandedKey
#if 0 != ContextKey
add $ContextKey, ExpandedKey
#endif

/* Store sentinel value of ExpandedKey on the stack on i386, a register on
x86_64.
*/
#define ExpandedKeyEnd Arch(5*16(r4), r11)

// Get and check "key length".
movzb ContextKeyLength(ExpandedKey), r0
cmp $160, r0
je 2f
cmp $192, r0
je 2f
cmp $224, r0
je 2f
mov $-1, r0 // Return error.
jmp 9f
2:

#if (Select == 0 || Select == 2)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKey, r0
mov r0, ExpandedKeyEnd
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKey, ExpandedKeyEnd
add r0, ExpandedKey
#endif

// Initialize State from input text.
#if defined __i386__
mov ArgInputText, BaseP
#define InputText BaseP
#endif
mov 0*4(InputText), Work0d
mov 1*4(InputText), S1
mov 2*4(InputText), S2
mov 3*4(InputText), S3
#undef InputText // Register is reused after this for other purposes.

// Add round key and save results.
xor 0*4(ExpandedKey), Work0d // S0 is in dissection register.
xor 1*4(ExpandedKey), S1
movd S1, vS1 // Save S1 to S3 in vector registers.
xor 2*4(ExpandedKey), S2
movd S2, vS2
xor 3*4(ExpandedKey), S3
movd S3, vS3

add $Increment, ExpandedKey // Advance to next round key.

#if defined __i386__
// Save expanded key address and increment in vector registers.
mov $Increment, t1
movp ExpandedKey, vExpandedKey
movp t1, vIncrement
#endif

// Set up relative addressing.
#if defined __i386__

// Get address of 0 in BaseP.
call 0f // Push program counter onto stack.
0:
pop BaseP // Get program counter.

// Define macros to help address data.
#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4)

#elif defined __x86_64__

lea MTable(%rip), BaseP

// Define macros to help address data.
#define LookupM(table, index) (table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) (table)*TableSize(BaseP, index, 4)

/* With these definitions of LookupM and LookupF, BaseP must be loaded with
the address of the table at the point where it is used. So we need an
instruction to change BaseP after we are done with MTable and before we
start using FTable. I would prefer to use something like:

.set FMinusM, FTable - MTable
#define LookupF(table, index) \
FMinusM+(table)*TableSize(BaseP, index, 4)

Then BaseP would not need to change. However, this fails due to an
assembler/linker bug.
*/

#endif

// Get round key.
mov 0*4(ExpandedKey), S0
mov 1*4(ExpandedKey), S1
mov 2*4(ExpandedKey), S2
mov 3*4(ExpandedKey), S3

1:
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/

// Process previous S0.
movzb Work0l, t0
xor LookupM(0, t0), S0
movzb Work0h, t0d
xor LookupM(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S2
movzb Work0h, t0d
xor LookupM(3, t0), P1

// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S1
movzb Work0h, t0d
xor LookupM(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S3
movzb Work0h, t0d
xor LookupM(3, t0), P2

// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S2
movzb Work0h, t0d
xor LookupM(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S0
movzb Work0h, t0d
xor LookupM(3, t0), P3

// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S3
movzb Work0h, t0d
xor LookupM(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S1
movzb Work0h, t0d
xor LookupM(3, t0), P0

#if defined __i386__
paddd vIncrement, vExpandedKey
movp vExpandedKey, ExpandedKey
#else
add $Increment, ExpandedKey
#endif

// Save state for next iteration and load next round key.
mov S0, Work0d
mov 0*4(ExpandedKey), S0
movd S1, vS1
mov 1*4(ExpandedKey), S1
movd S2, vS2
mov 2*4(ExpandedKey), S2
movd S3, vS3
mov 3*4(ExpandedKey), S3

cmp ExpandedKeyEnd, ExpandedKey
jne 1b

/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/

// Work around assembler bug. See comments above about Radar 5683882.
#if defined __x86_64__
lea FTable(%rip), BaseP
#endif

// Process previous S0.
movzb Work0l, t0
xor LookupF(0, t0), S0
movzb Work0h, t0d
xor LookupF(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S2
movzb Work0h, t0d
xor LookupF(3, t0), P1

// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S1
movzb Work0h, t0d
xor LookupF(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S3
movzb Work0h, t0d
xor LookupF(3, t0), P2

// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S2
movzb Work0h, t0d
xor LookupF(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S0
movzb Work0h, t0d
xor LookupF(3, t0), P3

// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S3
movzb Work0h, t0d
xor LookupF(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S1
movzb Work0h, t0d
xor LookupF(3, t0), P0

#if defined __i386__ // Architecture.
// Get OutputText address.
#define OutputText BaseP
mov ArgOutputText, OutputText
#endif // Architecture.

// Write output.
mov S0, 0*4(OutputText)
mov S1, 1*4(OutputText)
mov S2, 2*4(OutputText)
mov S3, 3*4(OutputText)

xor r0, r0 // Return success.

9:
// Pop stack and restore registers.
#if BUILDKERNEL
#if defined __i386__
movaps 4*16(r4), %xmm4
movaps 3*16(r4), %xmm3
#endif
movaps 2*16(r4), %xmm2
movaps 1*16(r4), %xmm1
movaps 0*16(r4), %xmm0
#endif // BUILDKERNEL
#if 0 < LocalsSize
add $Padding + LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
#endif
pop r3
pop r5

ret

#undef ArgExpandedKey
#undef ArgInputText
#undef ArgNr
#undef ArgOutputText
#undef Argument
#undef BaseP
#undef ExpandedKey
#undef ExpandedKeyEnd
#undef FTable
#undef InputText
#undef LocalsSize
#undef LookupM
#undef LookupF
#undef MTable
#undef OutputText
#undef Padding
#undef SaveSize
#undef S0
#undef S1
#undef S2
#undef S3
#undef StackFrame
#undef Work0d
#undef Work0h
#undef Work0l
#undef Work1d
#undef Work1h
#undef Work1l
#undef Work2d
#undef Work2h
#undef Work2l
#undef Work3d
#undef Work3h
#undef Work3l
#undef t0
#undef t0d
#undef t0l
#undef t1
#undef vExpandedKey
#undef vS1
#undef vS2
#undef vS3

#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment

#endif // defined(__x86_64__) || defined(__i386__)
@ -0,0 +1,38 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#if (defined(__x86_64__) || defined(__i386__))
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"

extern int aes_encrypt_aesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_decrypt_aesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_encrypt_nonaesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_decrypt_nonaesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);

int AccelerateCrypto_AES_encrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key)
{
    if (HAS_AESNI()) return aes_encrypt_aesni(in, out, key);
    else return aes_encrypt_nonaesni(in, out, key);
}

int AccelerateCrypto_AES_decrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key)
{
    if (HAS_AESNI()) return aes_decrypt_aesni(in, out, key);
    else return aes_decrypt_nonaesni(in, out, key);
}

#endif // (defined(__x86_64__) || defined(__i386__))

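A minimal caller of this dispatcher might look like the sketch below (hypothetical wrapper name; ctx is assumed to already hold an expanded key, which corecrypto sets up elsewhere):

    /* Hedged usage sketch (not from this source): encrypt one 16-byte
       block through the runtime AESNI/non-AESNI dispatcher above. */
    #include "AccelerateCrypto.h"

    int encrypt_one_block(const AccelerateCrypto_AES_ctx *ctx,
                          const unsigned char in[16], unsigned char out[16])
    {
        return AccelerateCrypto_AES_encrypt(in, out, ctx); /* 0 on success */
    }
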
@ -0,0 +1,483 @@
# Copyright (c) (2012,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/* This file defines _aes_encrypt_aesni and _aes_decrypt_aesni --- the Intel Westmere HW AES-based implementation
of _aes_encrypt and _aes_decrypt.

These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.

The AES HW is detected 1st thing in
_aes_encrypt (EncryptDecrypt.s)
_aes_decrypt (EncryptDecrypt.s)
and, if AES HW is detected, they branch without link (i.e., jump) to the functions here.

The implementation here follows the examples in an Intel White Paper,
"Intel Advanced Encryption Standard (AES) Instruction Set" Rev. 2.01.

Note: Rev. 03 Final (2010 01 26) is available; it looks like some code changed from Rev. 2.01.

*/

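HAS_AESNI() comes from config.h; conceptually it reduces to a CPUID feature test. A sketch of such a test (illustrative only, not the macro's actual definition):

    /* Illustrative AES-NI detection via CPUID leaf 1, ECX bit 25
       (sketch; HAS_AESNI() in config.h is the real mechanism here). */
    #include <cpuid.h>

    static int has_aesni(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;
        return (ecx >> 25) & 1;   /* CPUID.01H:ECX.AES[bit 25] */
    }
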
#if (defined __i386__ || defined __x86_64__)

.text
.p2align 4,0x90
.globl _aes_encrypt_aesni
_aes_encrypt_aesni:

#if defined __i386__
movl 4(%esp), %eax // in
movl 12(%esp), %edx // ctx
movl 8(%esp), %ecx // out

#define LOCAL_SIZE (12+16+16) // 16-byte align (-4 for return address) + 16 (xmm0) + 16 (xmm1)
#define in %eax
#define ctx %edx
#define out %ecx
#define r13 %esp

#else // x86_64

#define LOCAL_SIZE (8+16+16) // 16-byte align (-8 for return address) + 16 (xmm0) + 16 (xmm1)
#define in %rdi
#define ctx %rdx
#define out %rsi
#define r13 %rsp

#endif // i386 or x86_64

#if BUILDKERNEL
sub $LOCAL_SIZE, r13
movaps %xmm0, (r13)
#endif
movups (in), %xmm0

// key length identification
movl 240(ctx), %eax // key length
cmp $160, %eax
je L_AES_128
cmp $192, %eax
je L_AES_192
cmp $224, %eax
je L_AES_256
mov $-1, %eax // return ERROR
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret

L_AES_128:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenclast 160(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret

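The aligned path above folds each round-key load into the aesenc memory operand, which requires 16-byte alignment; the fallback loads through movups first. In intrinsics form the AES-128 flow looks like the sketch below (illustrative, not this file's code; _mm_loadu_si128 corresponds to the movups fallback, and compilers need -maes for these intrinsics):

    /* AES-128 single-block encryption with AES-NI intrinsics (sketch,
       not from this source). rk points at the 11 round keys. */
    #include <wmmintrin.h>

    static __m128i aes128_encrypt_block(__m128i block, const __m128i *rk)
    {
        block = _mm_xor_si128(block, _mm_loadu_si128(&rk[0]));
        for (int i = 1; i < 10; i++)
            block = _mm_aesenc_si128(block, _mm_loadu_si128(&rk[i]));
        return _mm_aesenclast_si128(block, _mm_loadu_si128(&rk[10]));
    }
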
L_AES_192:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenc 160(ctx), %xmm0
aesenc 176(ctx), %xmm0
aesenclast 192(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 176(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 192(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret

L_AES_256:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenc 160(ctx), %xmm0
aesenc 176(ctx), %xmm0
aesenc 192(ctx), %xmm0
aesenc 208(ctx), %xmm0
aesenclast 224(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 176(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 192(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 208(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 224(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret

.text
.p2align 4,0x90
.globl _aes_decrypt_aesni
_aes_decrypt_aesni:

#if defined __i386__
movl 4(%esp), %eax // in
movl 12(%esp), %edx // ctx
movl 8(%esp), %ecx // out

#endif

#if BUILDKERNEL
sub $LOCAL_SIZE, r13
movaps %xmm0, (r13)
#endif
movups (in), %xmm0

// key length identification
movl 240(ctx), %eax // key length
cmp $160, %eax
je 0f // AES-128
cmp $192, %eax
je 1f // AES-192
cmp $224, %eax
je 2f // AES-256
mov $-1, %eax // return ERROR
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret

0: // AES-128
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesdec xmm, m128 won't work
pxor 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-128 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 160(ctx), %xmm1
pxor %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret

1: // AES-192
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesdec xmm, m128 won't work
pxor 192(ctx), %xmm0
aesdec 176(ctx), %xmm0
aesdec 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-192 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 192(ctx), %xmm1
pxor %xmm1, %xmm0
movups 176(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 160(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret

2: // AES-256
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesdec xmm, m128 won't work
pxor 224(ctx), %xmm0
aesdec 208(ctx), %xmm0
aesdec 192(ctx), %xmm0
aesdec 176(ctx), %xmm0
aesdec 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-256 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 224(ctx), %xmm1
pxor %xmm1, %xmm0
movups 208(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 192(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 176(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 160(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret

#endif /* x86 based build */

@ -0,0 +1,146 @@
# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>

/* AES.s -- Core AES routines for Intel processors.

   Written by Eric Postpischil, January 30, 2008.
*/

#if (defined __i386__ || defined __x86_64__)

/* We build these AES routines as a single module because the routines refer
   to labels in Data.s and it is easier and faster to refer to them as local
   labels. In my implementations of AES for CommonCrypto, both i386 and
   x86_64 use position-independent code. For this in-kernel implementation,
   i386 has been converted to absolute addressing, but x86_64 still uses PIC.

   A local label can be referred to with position-independent assembler
   expressions such as "label-base(register)", where <base> is a local label
   whose address has been loaded into <register>. (On i386, this is typically
   done with the idiom of a call to the next instruction and a pop of that
   return address into a register.) Without local labels, the references must
   be done using spaces for addresses of "lazy symbols" that are filled in by
   the dynamic loader and loaded by the code that wants the address.

   So the various routines in other files are assembled here via #include
   directives.
*/
#include "Data.s"

#define TableSize (256*4)
/* Each of the arrays defined in Data.s except for the round constants
   in _AESRcon is composed of four tables of 256 entries of four bytes
   each. TableSize is the number of bytes in one of those four tables.
*/

// Include constants describing the AES context structures.
#include "Context.h"

/* Define a macro to select a value based on architecture. This reduces
   some of the architecture conditionalization later in the source.
*/
#if defined __i386__
#define Arch(i386, x86_64) i386
#elif defined __x86_64__
#define Arch(i386, x86_64) x86_64
#endif

// Define an instruction for moving pointers.
#define movp Arch(movd, movd)
// Latter argument should be "movq", but the assembler uses "movd".

/* Rename the general registers. This makes it easier to keep track of them
   and provides names for the "whole register" that are uniform between i386
   and x86_64.
*/
#if defined __i386__
#define r0 %eax // Available for any use.
#define r1 %ecx // Available for any use, some special purposes (loop).
#define r2 %edx // Available for any use.
#define r3 %ebx // Must be preserved by called routine.
#define r4 %esp // Stack pointer.
#define r5 %ebp // Frame pointer, must preserve, no bare indirect.
#define r6 %esi // Must be preserved by called routine.
#define r7 %edi // Must be preserved by called routine.
#elif defined __x86_64__
#define r0 %rax // Available for any use.
#define r1 %rcx // Available for any use.
#define r2 %rdx // Available for any use.
#define r3 %rbx // Must be preserved by called routine.
#define r4 %rsp // Stack pointer.
#define r5 %rbp // Frame pointer. Must be preserved by called routine.
#define r6 %rsi // Available for any use.
#define r7 %rdi // Available for any use.
#define r8 %r8 // Available for any use.
#define r9 %r9 // Available for any use.
#define r10 %r10 // Available for any use.
#define r11 %r11 // Available for any use.
#define r12 %r12 // Must be preserved by called routine.
#define r13 %r13 // Must be preserved by called routine.
#define r14 %r14 // Must be preserved by called routine.
#define r15 %r15 // Must be preserved by called routine.
#else
#error "Unknown architecture."
#endif

// Define names for parts of registers.

#define r0d %eax // Low 32 bits of r0.
#define r1d %ecx // Low 32 bits of r1.
#define r2d %edx // Low 32 bits of r2.
#define r3d %ebx // Low 32 bits of r3.
#define r5d %ebp // Low 32 bits of r5.
#define r6d %esi // Low 32 bits of r6.
#define r7d %edi // Low 32 bits of r7.
#define r8d %r8d // Low 32 bits of r8.
#define r9d %r9d // Low 32 bits of r9.
#define r11d %r11d // Low 32 bits of r11.

#define r0l %al // Low byte of r0.
#define r1l %cl // Low byte of r1.
#define r2l %dl // Low byte of r2.
#define r3l %bl // Low byte of r3.
#define r5l %bpl // Low byte of r5.

#define r0h %ah // Second lowest byte of r0.
#define r1h %ch // Second lowest byte of r1.
#define r2h %dh // Second lowest byte of r2.
#define r3h %bh // Second lowest byte of r3.

.text

// Define encryption routine, _AESEncryptWithExpandedKey
#define Select 0
#include "EncryptDecrypt.s"
#undef Select

// Define decryption routine, _AESDecryptWithExpandedKey
#define Select 1
#include "EncryptDecrypt.s"
#undef Select

// Define key expansion routine for encryption, _AESExpandKeyForEncryption.
// #include "ExpandKeyForEncryption.s"

// Define key expansion for decryption routine, _AESExpandKeyForDecryption.
// #include "ExpandKeyForDecryption.s"
#endif /* x86 based build */

File diff suppressed because it is too large

@ -0,0 +1,362 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

/*

void SHA1( int HASH[], int MESSAGE[] )
{
    int A[81], B[81], C[81], D[81], E[81];
    int W[80];
    int i, FN;

    A[0] = HASH[0]; B[0] = HASH[1]; C[0] = HASH[2]; D[0] = HASH[3]; E[0] = HASH[4];

    for ( i=0; i<80; ++i ) {
        if ( i < 16 )
            W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
        else
            W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

        FN = F( i, B[i], C[i], D[i] );
        A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
        B[i+1] = A[i];
        C[i+1] = ROTATE_LEFT( B[i], 30 );
        D[i+1] = C[i];
        E[i+1] = D[i];
    }

    HASH[0] += A[80]; HASH[1] += B[80]; HASH[2] += C[80]; HASH[3] += D[80]; HASH[4] += E[80];
}


For i=0:15, W[i] is simply a big-endian load of MESSAGE[i].
For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79:

1. update 4 consecutive W[i] (stored in a single 16-byte register)
    W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

2. this additional calculation unfortunately requires many additional operations
    W[i+3] ^= W[i] rol 1

3. once we have 4 W[i] values in a Q register, we can also add four K values with one instruction
    W[i:i+3] += {K,K,K,K}

Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed,
W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on.

The Dean Gaudet approach can be expressed as

1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}

For i>=32, the Intel online article suggests that (using the basic identity (X rol 1) rol 1 = X rol 2)
the update equation is equivalent to

1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0
   (with W0 indicating the most recent 16 bytes)
    i=0, W28,W24,...,W0
    i=4, W24,W20,...,W28
    i=8, W20,W16,...,W24
    .
    .
   and so forth.
3. once a W-vector is computed, W+K is then computed and saved in the stack memory; it is used later when
   updating the digests A/B/C/D/E

the execution flow (for 1 single 64-byte block) looks like

W_PRECALC_00_15 // big-endian loading of 64 bytes into 4 W-vectors, compute WK=W+K, save WK in the stack memory

W_PRECALC_16_31 // for each vector, update digests, update W (Gaudet) and WK=W+K, save WK in the stack memory

W_PRECALC_32_79 // for each vector, update digests, update W (Intel) and WK=W+K, save WK in the stack memory

our implementation (which allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:

----------------------------------------------------------------------------------------------------------

load W(0:15) (big-endian per 4 bytes) into 4 Q registers
pre_calculate and store WK = W+K(0:15) in 16-byte aligned stack memory

L_loop:

load digests a-e from ctx->state;

for (r=0;r<16;r+=4) {
    digests a-e update and permute round r:r+3
    update W([r:r+3]%16) (Gaudet) and WK([r:r+3]%16) for use 16 rounds later
}

for (r=16;r<64;r+=4) {
    digests a-e update and permute round r:r+3
    update W([r:r+3]%16) (Intel) and WK([r:r+3]%16) for use 16 rounds later
}

num_block--;
if (num_block==0) jmp L_last_block;

for (r=64;r<80;r+=4) {
    digests a-e update and permute round r:r+3
    load W([r:r+3]%16) (big-endian per 4 bytes) into 4 Q registers
    pre_calculate and store W+K([r:r+3]%16) in stack
}

ctx->states += digests a-e;

jmp L_loop;

L_last_block:

for (r=64;r<80;r+=4) {
    digests a-e update and permute round r:r+3
}

ctx->states += digests a-e;

----------------------------------------------------------------------------------------------------------

*/
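
/*
   A scalar C model of the two schedule recurrences above (a sketch for
   reference only; rol32 is a hypothetical rotate-left helper, not part of
   this file):

       static inline uint32_t rol32(uint32_t x, int n)
       {
           return (x << n) | (x >> (32 - n));
       }

       // Gaudet form, per lane (i = 16..31); lane 3 of each quadruple is
       // patched afterwards with W[i+3] ^= rol32(W[i], 1):
       //     W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);

       // Intel form, per lane (i >= 32), from (x rol 1) rol 1 == x rol 2:
       //     W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2);
*/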

#if defined(__arm64__)

#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"

.subsections_via_symbols
.text

.p2align 4

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4

.p2align 4

.globl _AccelerateCrypto_SHA1_compress
_AccelerateCrypto_SHA1_compress:

#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3

BRANCH_TARGET_CALL

#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif

// early exit if the input number of blocks is zero

adrp ktable, K_XMM_AR@page
cbnz numblocks, 1f
ret lr
1:
add ktable, ktable, K_XMM_AR@pageoff // K table

#if BUILDKERNEL

// save the vector registers used in the computation (v0-v7, v16-v24)

sub x4, sp, #17*16
sub sp, sp, #17*16

st1.4s {v0,v1,v2,v3}, [x4], #64
st1.4s {v4,v5,v6,v7}, [x4], #64
st1.4s {v16,v17,v18,v19}, [x4], #64
st1.4s {v20,v21,v22,v23}, [x4], #64
st1.4s {v24}, [x4], #16

#endif
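// 17 vector registers (v0-v7 and v16-v24) at 16 bytes each account for the
// #17*16 bytes reserved above.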

ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
ld1.4s {v21,v22,v23,v24}, [ktable], #64 // k1,k2,k3,k4
ldr q16, [hashes], #16
ldr s17, [hashes], #-16

rev32.16b v0, v0 // byte swap of 1st 4 ints
rev32.16b v1, v1 // byte swap of 2nd 4 ints
rev32.16b v2, v2 // byte swap of 3rd 4 ints
rev32.16b v3, v3 // byte swap of 4th 4 ints

mov.16b v18, v16
add.4s v4, v0, v21 // 1st 4 input words + K1
add.4s v5, v1, v21 // 2nd 4 input words + K1
mov.16b v19, v17
add.4s v6, v2, v21 // 3rd 4 input words + K1
add.4s v7, v3, v21 // 4th 4 input words + K1


.macro sha1c_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1C 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm

.macro sha1p_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1P 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm

.macro sha1m_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1M 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm

// hash-update rounds that also load the next block's vectors and precompute W+K
.macro sha1p_hash_load_round
rev32.16b $1, $1
mov.16b v20, v18
SHA1P 18, 19, $0
SHA1H 19, 20
add.4s $2, $1, $3
.endm

.macro sha1p_hash_round
mov.16b v20, v18
SHA1P 18, 19, $0
SHA1H 19, 20
.endm
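
/*
   For reference, the canonical intrinsics pattern behind one such quartet
   of rounds (a sketch using <arm_neon.h>; the assembly above keeps the
   state in v18/v19 rather than C variables):

       #include <arm_neon.h>

       uint32x4_t abcd;                 // digests A,B,C,D
       uint32_t e;                      // digest E
       uint32x4_t w0, w4, w8, w12, wk;  // schedule vectors and current W+K

       uint32_t e2 = vsha1h_u32(vgetq_lane_u32(abcd, 0));  // rol(A,30): E input of the next quartet
       abcd = vsha1cq_u32(abcd, e, wk);                    // four rounds with the Ch function
       w0 = vsha1su1q_u32(vsha1su0q_u32(w0, w4, w8), w12); // message schedule update
*/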

sha1c_round 0, 1, 2, 3, 4, v0, v4, v21
sha1c_round 1, 2, 3, 0, 5, v1, v5, v22
sha1c_round 2, 3, 0, 1, 6, v2, v6, v22
sha1c_round 3, 0, 1, 2, 7, v3, v7, v22

sha1c_round 0, 1, 2, 3, 4, v0, v4, v22
sha1p_round 1, 2, 3, 0, 5, v1, v5, v22
sha1p_round 2, 3, 0, 1, 6, v2, v6, v23
sha1p_round 3, 0, 1, 2, 7, v3, v7, v23

sha1p_round 0, 1, 2, 3, 4, v0, v4, v23
sha1p_round 1, 2, 3, 0, 5, v1, v5, v23
sha1m_round 2, 3, 0, 1, 6, v2, v6, v23
sha1m_round 3, 0, 1, 2, 7, v3, v7, v24

sha1m_round 0, 1, 2, 3, 4, v0, v4, v24
sha1m_round 1, 2, 3, 0, 5, v1, v5, v24
sha1m_round 2, 3, 0, 1, 6, v2, v6, v24
sha1p_round 3, 0, 1, 2, 7, v3, v7, v24

subs numblocks, numblocks, #1 // pre-decrement numblocks by 1
b.le L_wrapup


L_loop:

ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian

sha1p_hash_load_round 4, v0, v4, v21
sha1p_hash_load_round 5, v1, v5, v21
sha1p_hash_load_round 6, v2, v6, v21
sha1p_hash_load_round 7, v3, v7, v21

add.4s v18, v16, v18
add.4s v19, v17, v19
mov.16b v16, v18
mov.16b v17, v19

sha1c_round 0, 1, 2, 3, 4, v0, v4, v21
sha1c_round 1, 2, 3, 0, 5, v1, v5, v22
sha1c_round 2, 3, 0, 1, 6, v2, v6, v22
sha1c_round 3, 0, 1, 2, 7, v3, v7, v22

sha1c_round 0, 1, 2, 3, 4, v0, v4, v22
sha1p_round 1, 2, 3, 0, 5, v1, v5, v22
sha1p_round 2, 3, 0, 1, 6, v2, v6, v23
sha1p_round 3, 0, 1, 2, 7, v3, v7, v23

sha1p_round 0, 1, 2, 3, 4, v0, v4, v23
sha1p_round 1, 2, 3, 0, 5, v1, v5, v23
sha1m_round 2, 3, 0, 1, 6, v2, v6, v23
sha1m_round 3, 0, 1, 2, 7, v3, v7, v24

sha1m_round 0, 1, 2, 3, 4, v0, v4, v24
sha1m_round 1, 2, 3, 0, 5, v1, v5, v24
sha1m_round 2, 3, 0, 1, 6, v2, v6, v24
sha1p_round 3, 0, 1, 2, 7, v3, v7, v24

subs numblocks, numblocks, #1 // decrement numblocks by 1
b.gt L_loop

L_wrapup:

sha1p_hash_round 4
sha1p_hash_round 5
sha1p_hash_round 6
sha1p_hash_round 7

add.4s v16, v16, v18
add.4s v17, v17, v19
str q16, [hashes], #16
str s17, [hashes]


#if BUILDKERNEL

// restore the vector registers clobbered in the computation (v0-v7, v16-v24)

ld1.4s {v0,v1,v2,v3}, [sp], #64
ld1.4s {v4,v5,v6,v7}, [sp], #64
ld1.4s {v16,v17,v18,v19}, [sp], #64
ld1.4s {v20,v21,v22,v23}, [sp], #64
ld1.4s {v24}, [sp], #16

#endif

ret lr

#endif // defined(__arm64__)

@ -0,0 +1,30 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"

#if (defined(__x86_64__) || defined(__i386__))
extern void AccelerateCrypto_SHA1_compress_ssse3(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_ssse3");
extern void AccelerateCrypto_SHA1_compress_AVX1(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_AVX1");
extern void AccelerateCrypto_SHA1_compress_AVX2(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_AVX2");

void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t num, const void *buf)
{
#if defined(__x86_64__)
    if (HAS_AVX2()) AccelerateCrypto_SHA1_compress_AVX2(state, num, buf);
    else if (HAS_AVX1()) AccelerateCrypto_SHA1_compress_AVX1(state, num, buf);
    else
#endif
        AccelerateCrypto_SHA1_compress_ssse3(state, num, buf);
}
#endif // (defined(__x86_64__) || defined(__i386__))
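
/*
   Usage sketch (hypothetical caller, not part of this file): compress two
   64-byte blocks into a SHA-1 state. The five-word state layout and the
   initial values are the standard FIPS 180-4 ones.

       #include <stdint.h>
       #include <string.h>

       static void sha1_two_blocks(const uint8_t blocks[128], uint32_t out[5])
       {
           uint32_t state[5] = {
               0x67452301u, 0xefcdab89u, 0x98badcfeu, 0x10325476u, 0xc3d2e1f0u
           };
           AccelerateCrypto_SHA1_compress(state, 2, blocks); // 2 x 64-byte blocks
           memcpy(out, state, sizeof(state));                // padding/finalization not shown
       }
*/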
@ -0,0 +1,785 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>

#if defined(__x86_64__)

/* vng_sha1LittleEndian.s : this file provides an optimized x86_64 avx1 implementation of the sha1 function
   CoreOS - vector and numerics group

The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/


Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function

void SHA1( int HASH[], int MESSAGE[] )
{
    int A[81], B[81], C[81], D[81], E[81];
    int W[80];

    int i, FN;

    A[0] = HASH[0];
    B[0] = HASH[1];
    C[0] = HASH[2];
    D[0] = HASH[3];
    E[0] = HASH[4];

    for ( i=0; i<80; ++i )
    {
        if ( i < 16 )
            W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
        else
            W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

        FN = F( i, B[i], C[i], D[i] );

        A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
        B[i+1] = A[i];
        C[i+1] = ROTATE_LEFT( B[i], 30 );
        D[i+1] = C[i];
        E[i+1] = D[i];
    }

    HASH[0] += A[80];
    HASH[1] += B[80];
    HASH[2] += C[80];
    HASH[3] += D[80];
    HASH[4] += E[80];
}

For i=0:15, W[i] is simply a big-endian load of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );

The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79:

1. done on 4 consecutive W[i] values in a single XMM register
    W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

2. this additional calculation unfortunately requires many additional operations
    W[i+3] ^= W[i] rol 1

3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
    W[i:i+3] += {K,K,K,K}

Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on.
The Dean Gaudet approach can be expressed as

1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}

For i>=32, the Intel online article suggests that (using the basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to

1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16 bytes)
    i=0, W28,W24,...,W0
    i=4, W24,W20,...,W28
    i=8, W20,W16,...,W24
    .
    .
   and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
    a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
    b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)

*/

/* the code can be compiled into single-block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define Multiple_Blocks 1

#if defined (__x86_64__)

#if BUILDKERNEL
#define stack_size (32*10+16*4+16) // ymm0-9 + 4 128-bit slots for intermediate WK(t) storage + 32-byte alignment
#else
#define stack_size (16*4) // 4 128-bit slots for intermediate WK(t) storage
#endif
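// Breakdown of the kernel stack_size above: 10 ymm saves (32*10 bytes),
// four 16-byte WK(t) quadruples (16*4 bytes), and 16 bytes of slack so the
// frame can be rounded down to 32-byte alignment for the vmovaps saves.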
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to the table of K values (followed by the shufb byte-swap mask)
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks

// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with avx1 support

#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP REV32(%rip)

#define xmov vmovaps // aligned 16-byte move
#define xmovu vmovups // unaligned 16-byte move

// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#define D %r15d
#define E %edx

// temp variables
#define T1 %eax
#define T2 %ebx

#define WK(t) ((t)&15)*4(sp)
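
/*
   WK(t) addresses a 16-entry circular buffer of 32-bit W+K words at the
   bottom of the frame: entry (t mod 16) lives at byte offset 4*(t mod 16)
   from sp. A C model of the same indexing (illustrative only):

       uint32_t wk_buf[16];
       #define WK_MODEL(t) wk_buf[(t) & 15]
*/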

// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm

// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm

// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
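// Note (added for clarity): the sequence above computes ((B|C) & D) | (B & C),
// which is the same majority function as the (B & C) | (D & (B ^ C)) quoted
// in the comment.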

// for i=60:79, F4 is identical to F2
#define F4 F2


/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);

for (i=0;i<16;i+=4) {
    1. W_TMP = new 16 bytes from MESSAGE[]
    2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
    3. W_TMP += {K,K,K,K};
    4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}

each step is represented in one of the following 4 macro definitions

*/

.macro W_PRECALC_00_15_0 arg0 // input argument $0 : 0/4/8/12
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16 bytes into W_TMP; BUFFER_PTR is possibly not 16-byte aligned
.endm

.macro W_PRECALC_00_15_1 arg0 // input argument $0 : current 16 bytes in the circular buffer, one of W0,W4,W8,...,W28
vpshufb XMM_SHUFB_BSWAP, W_TMP, \arg0 // convert W_TMP from little-endian into big-endian
.endm

.macro W_PRECALC_00_15_2 arg0 // K_BASE points to the current K quadruple.
vpaddd (K_BASE), \arg0, W_TMP // W_TMP += {K,K,K,K};
.endm

.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which will be used later for updating the hashes A/B/C/D/E
.endm

// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
    W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
    W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
    W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
    W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1

    W[i+3] ^= W[i] rol 1; // this W[i] has already been rotated by 1; if we take the initial W before the rol 1, we should rotate it by 2

The operation (updating W and W+K) is scheduled and divided into 4 steps

0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP+K;

*/

.macro W_PRECALC_16_31_0 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
vpalignr $8, \arg0, \arg1, \arg4 // W = W14
vpsrldq $4, \arg3, W_TMP // W_TMP = W3
vpxor \arg2, \arg4, \arg4 // W = W8 ^ W14
.endm

.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, W_TMP, W_TMP // W_TMP = W3 ^ W16
vpxor W_TMP, \arg1, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
vpslldq $12, \arg1, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm

.macro W_PRECALC_16_31_2 arg0 // input argument : W
vpslld $1, \arg0, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
vpsrld $31, \arg0, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
vpor \arg0, W_TMP, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
vpslld $2, W_TMP2, \arg0 // W = W[i] higher 30 bits after rol 2
vpsrld $30, W_TMP2, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
.endm

.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
vpxor W_TMP, \arg0, \arg0
vpxor W_TMP2, \arg0, \arg0 // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
vpaddd \arg2(K_BASE), \arg0, W_TMP // W+K
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for the later update of the hashes A/B/C/D/E
.endm

/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article

W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);

where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.


0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_tmp; WK = W_tmp + K;

*/


.macro W_PRECALC_32_79_0 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
vpxor \arg0, \arg3, \arg3 // W = W28 ^ W32;
vpalignr $8, \arg1, \arg2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm

.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, \arg1, \arg1 // W_tmp = W6 ^ W16
vpxor W_TMP, \arg1, \arg1 // W_tmp = W6 ^ W16 ^ W28 ^ W32
//xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm

.macro W_PRECALC_32_79_2 arg0 // input argument : W
vpslld $2, \arg0, W_TMP // W << 2
vpsrld $30, \arg0, \arg0 // W >> 30
vpor W_TMP, \arg0, \arg0 // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm

.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input arguments : W, i, K_XMM
vpaddd \arg2(K_BASE), \arg0, W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
.endm


/* The hash update operation is completed by the following statements.

    A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
    B[i+1] = A[i];
    C[i+1] = ROTATE_LEFT( B[i], 30 );
    D[i+1] = C[i];
    E[i+1] = D[i];

Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:

    A1 = FN + E0 + rol(A0,5) + WK;
    B1 = A0;
    C1 = rol(B0, 30);
    D1 = C0;
    E1 = D0;

To avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.

Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D


To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C

These operations are distributed into the following 2 macro definitions RR0 and RR1.

*/
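
/*
   A scalar C sketch of one RR0+RR1 pair (two SHA-1 rounds; rol32 and wk are
   hypothetical helpers standing in for the rol instruction and the WK(t)
   stack slots, and fn is one of F1..F4):

       e += fn(b, c, d) + rol32(a, 5) + wk(i);     // round i: the new A is kept in e
       b  = rol32(b, 30);
       d += fn(a, b, c) + rol32(e, 5) + wk(i + 1); // round i+1: the new A is kept in d
       a  = rol32(a, 30);
*/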

.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
rol $30, \arg2 // B = rol(B,30)
add WK(\arg6), \arg5 // E + WK(i)
mov \arg1, T2 // T2 = A
add WK(\arg6+1), \arg4 // D + WK(i+1)
rol $5, T2 // rol(A,5)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm

.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add \arg5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
mov T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rol $5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm


.macro INITIAL_W_PRECALC // BIG_ENDIAN_LOAD(64-byte block) into W (i=0:15) and store W+K into the stack memory

// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K

// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K

// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K

// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K

.endm


.macro INTERNAL // updating W (16:79) and updating the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)

// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2

// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6

// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10

// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14

// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18

// start using F2

// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
W_PRECALC_32_79_2 W28
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22

// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
W_PRECALC_32_79_2 W24
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26

// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30

// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34

// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38

// start using F3

// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42

// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46

// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50

// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
W_PRECALC_32_79_2 W28
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54

// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
W_PRECALC_32_79_2 W24
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58

// start using F4

// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62

.endm
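// Note on the K_XMM argument of W_PRECALC_*_3 (0, 16, 32, or 48): it is the
// byte offset of the K1..K4 quadruple inside K_XMM_AR, so each group of
// rounds adds its own round constant via \arg2(K_BASE).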

.macro SOFTWARE_PIPELINING
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66

// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
RR1 F4,A,B,C,D,E,70

// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74

// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm


#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0

.macro ENDING // finish up updating hash digests (i=64:79)
//i=64
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66

//i=68
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70

//i=72
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74

//i=76
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm

// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
.endm

.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
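// UPDATE_HASH adds the working value into the stored hash word and writes
// the sum back, leaving the register updated as well, so the next block can
// start from the new state without a reload.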

.macro UPDATE_ALL_HASH
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
.endm


/*
main sha1 code for systems with avx1 support
*/

.macro SHA1_PIPELINED_MAIN_BODY
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
addq _IMM(64), BUFFER_PTR // BUFFER_PTR += 64;
subq _IMM(1), cnt // pre-decrement cnt by 1
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm

/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembly code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_AVX1
_AccelerateCrypto_SHA1_compress_AVX1:

// start the sha1 code with avx1 support

// save callee-save registers
push %rbp
mov %rsp, %rbp
push %rbx
push %r15

sub $stack_size, sp // allocate stack memory for use

// save used ymm registers if this is for kernel
#if BUILDKERNEL
andq $-32, sp // align sp to 32 bytes
leaq 4*16(sp), %rax
xmov %ymm0, 0*32(%rax)
xmov %ymm1, 1*32(%rax)
xmov %ymm2, 2*32(%rax)
xmov %ymm3, 3*32(%rax)
xmov %ymm4, 4*32(%rax)
xmov %ymm5, 5*32(%rax)
xmov %ymm6, 6*32(%rax)
xmov %ymm7, 7*32(%rax)
xmov %ymm8, 8*32(%rax)
xmov %ymm9, 9*32(%rax)
#endif

// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE

SHA1_PIPELINED_MAIN_BODY

// restore used ymm registers if this is for kernel
#if BUILDKERNEL
leaq 4*16(sp), %rax
xmov 0*32(%rax), %ymm0
xmov 1*32(%rax), %ymm1
xmov 2*32(%rax), %ymm2
xmov 3*32(%rax), %ymm3
xmov 4*32(%rax), %ymm4
xmov 5*32(%rax), %ymm5
xmov 6*32(%rax), %ymm6
xmov 7*32(%rax), %ymm7
xmov 8*32(%rax), %ymm8
xmov 9*32(%rax), %ymm9
#endif

leaq -16(%rbp), %rsp

// restore callee-save registers
pop %r15
pop %rbx
pop %rbp

ret // return

CC_ASM_SECTION_CONST
.p2align 4, 0x90

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
REV32:
// bswap_shufb_ctl: accessed through 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f

#endif // architecture x86_64

#endif // defined(__x86_64__)
@ -0,0 +1,780 @@
|
|||
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
/* vng_sha1LittleEndian.s : this file provides optimized x86_64 avx2 implementation of the sha1 function
|
||||
CoreOS - vector and numerics group
|
||||
|
||||
The implementation is based on the principle described in an Intel online article
|
||||
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
|
||||
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
|
||||
|
||||
|
||||
Update HASH[] by processing a one 64-byte block in MESSAGE[] can be represented by the following C function
|
||||
|
||||
void SHA1( int HASH[], int MESSAGE[] )
|
||||
{
|
||||
int A[81], B[81], C[81], D[81], E[81];
|
||||
int W[80];
|
||||
|
||||
int i, FN;
|
||||
|
||||
A[0] = HASH[0];
|
||||
B[0] = HASH[1];
|
||||
C[0] = HASH[2];
|
||||
D[0] = HASH[3];
|
||||
E[0] = HASH[4];
|
||||
|
||||
for ( i=0; i<80; ++i )
|
||||
{
|
||||
if ( i < 16 )
|
||||
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
|
||||
else
|
||||
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
|
||||
|
||||
FN = F( i, B[i], C[i], D[i] );
|
||||
|
||||
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
|
||||
B[i+1] = A[i];
|
||||
C[i+1] = ROTATE_LEFT( B[i], 30 );
|
||||
D[i+1] = C[i];
|
||||
E[i+1] = D[i];
|
||||
}
|
||||
|
||||
HASH[0] += A[80];
|
||||
HASH[1] += B[80];
|
||||
HASH[2] += C[80];
|
||||
HASH[3] += D[80];
|
||||
HASH[4] += E[80];
|
||||
}
|
||||
|
||||
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
|
||||
|
||||
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
|
||||
|
||||
1. done on 4 consequtive W[i] values in a single XMM register
|
||||
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
|
||||
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
|
||||
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
|
||||
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
|
||||
|
||||
2. this additional calculation unfortunately requires many additional operations
|
||||
W[i+3] ^= W[i] rol 1
|
||||
|
||||
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
|
||||
W[i:i+3] += {K,K,K,K}
|
||||
|
||||
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
|
||||
The Dean Gaudet approach can be expressed as
|
||||
|
||||
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
|
||||
2. W[i+3] ^= W[i] rol 1
|
||||
3. W0 += {K,K,K,K}
|
||||
|
||||
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
|
||||
|
||||
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
|
||||
|
||||
Note:
|
||||
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
|
||||
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
|
||||
i=0, W28,W24,...,W0
|
||||
i=4, W24,W20,...,W28
|
||||
i=8, W20,W16,...,W24
|
||||
.
|
||||
.
|
||||
and so forth.
|
||||
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
|
||||
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
|
||||
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
|
||||
|
||||
*/
|
||||
|
||||
/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_blocks to 0 */
|
||||
#define Multiple_Blocks 1
|
||||
|
||||
#if BUILDKERNEL
|
||||
#define stack_size (32*10+16*4+16) // ymm0-9 + 4 128-bits for intermediate WK(t) storage + 32byte alignment
|
||||
#else
|
||||
#define stack_size (16*4) // 4 128-bits for intermediate WK(t) storage
|
||||
#endif
|
||||
#define sp %rsp // unifying architectural stack pointer representation
|
||||
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
|
||||
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
|
||||
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
|
||||
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
|
||||
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
|
||||
#define BUFFER_PTR %r10 // pointer to input blocks
|
||||
|
||||
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with avx2 support
|
||||
|
||||
#define W_TMP %xmm0
|
||||
#define W_TMP2 %xmm1
|
||||
#define W0 %xmm2
|
||||
#define W4 %xmm3
|
||||
#define W8 %xmm4
|
||||
#define W12 %xmm5
|
||||
#define W16 %xmm6
|
||||
#define W20 %xmm7
|
||||
#define W24 %xmm8
|
||||
#define W28 %xmm9
|
||||
#define XMM_SHUFB_BSWAP REV32(%rip)
|
||||
|
||||
#define xmov vmovaps // aligned 16-byte move
|
||||
#define xmovu vmovups // unaligned 16-byte move
|
||||
|
||||
// intermediate hash variables
|
||||
#define A %ecx
|
||||
#define B %esi
|
||||
#define C %edi
|
||||
#define D %r15d
|
||||
#define E %edx
|
||||
|
||||
// temp variables
|
||||
#define T1 %eax
|
||||
#define T2 %ebx
|
||||
|
||||
#define WK(t) ((t)&15)*4(sp)
|
||||
|
||||
// int F1(int B, int C, int D) { return (D ^ ( B & (C ^ D)); }
|
||||
// result in T1
|
||||
.macro F1 arg0, arg1, arg2
|
||||
mov \arg1, T1
|
||||
xor \arg2, T1
|
||||
and \arg0, T1
|
||||
xor \arg2, T1
|
||||
.endm
|
||||
|
||||
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
|
||||
// result in T1
|
||||
.macro F2 arg0, arg1, arg2
|
||||
mov \arg2, T1
|
||||
xor \arg1, T1
|
||||
xor \arg0, T1
|
||||
.endm
|
||||
|
||||
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
|
||||
// result in T1
|
||||
.macro F3 arg0, arg1, arg2
|
||||
mov \arg1, T1
|
||||
mov \arg0, T2
|
||||
or \arg0, T1
|
||||
and \arg1, T2
|
||||
and \arg2, T1
|
||||
or T2, T1
|
||||
.endm
|
||||
|
||||
// for i=60:79, F4 is identical to F2
|
||||
#define F4 F2
|
||||
|
||||
|
||||
/*
|
||||
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
|
||||
|
||||
for (i=0;i<16;i+=4) {
|
||||
1. W_TMP = new 16 bytes from MESSAGE[]
|
||||
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
|
||||
3. WTMP += {K,K,K,K};
|
||||
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
|
||||
}
|
||||
|
||||
each step is represented in one of the following 4 macro definitions
|
||||
|
||||
*/
|
||||
|
||||
.macro W_PRECALC_00_15_0 arg0 // input argument $0 : 0/4/8/12
|
||||
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_00_15_1 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
|
||||
vpshufb XMM_SHUFB_BSWAP, W_TMP, \arg0 // convert W_TMP from little-endian into big-endian
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_00_15_2 arg0 // K_BASE points to the current K quadruple.
|
||||
vpaddd (K_BASE), \arg0, W_TMP // W_TMP += {K,K,K,K};
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_00_15_3 arg0
|
||||
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
|
||||
.endm
|
||||
|
||||
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
|
||||
/*
|
||||
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
|
||||
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
|
||||
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
|
||||
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
|
||||
|
||||
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1, if we are taking from the intial W before rol 1, we should rol this by 2
|
||||
|
||||
The operation (updating W and W+K) is scheduled as and divided into 4 steps
|
||||
|
||||
0. W_tmp = W3; W = W14 ^ W8
|
||||
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
|
||||
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
|
||||
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K;
|
||||
|
||||
*/
|
||||
|
||||
.macro W_PRECALC_16_31_0 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
|
||||
vpalignr $8, \arg0, \arg1, \arg4 // W = W14
|
||||
vpsrldq $4, \arg3, W_TMP // W_TMP = W3
|
||||
vpxor \arg2, \arg4, \arg4 // W = W8 ^ W14
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
|
||||
vpxor \arg0, W_TMP, W_TMP // W_TMP = W3 ^ W16
|
||||
vpxor W_TMP, \arg1, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
|
||||
vpslldq $12, \arg1, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_16_31_2 arg0 // input argument : W
|
||||
vpslld $1, \arg0, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
|
||||
vpsrld $31, \arg0, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
|
||||
vpor \arg0, W_TMP, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
|
||||
vpslld $2, W_TMP2, \arg0 // W = W[i] higher 30 bits after rol 2
|
||||
vpsrld $30, W_TMP2, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
|
||||
vpxor W_TMP, \arg0, \arg0
|
||||
vpxor W_TMP2, \arg0, \arg0 // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
|
||||
vpaddd \arg2(K_BASE), \arg0, W_TMP // W+K
|
||||
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
|
||||
.endm
|
||||
|
||||
/* rounds 32-79 compute W und W+K iusing the vectorization approach from the Intel article
|
||||
|
||||
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
|
||||
|
||||
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
|
||||
|
||||
|
||||
0. W_tmp = W6; W = W28 ^ W32;
|
||||
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
|
||||
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
|
||||
3. W = W_Tmp; WK = W_tmp + K;
|
||||
|
||||
*/
|
||||
|
||||
|
||||
.macro W_PRECALC_32_79_0 arg0, arg1, arg2, arg3 // inputr arguments : W28,W8,W4,W
|
||||
vpxor \arg0, \arg3, \arg3 // W = W28 ^ W32;
|
||||
vpalignr $8, \arg1, \arg2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
|
||||
vpxor \arg0, \arg1, \arg1 // W_tmp = W6 ^ W16
|
||||
vpxor W_TMP, \arg1, \arg1 // W_tmp = W6 ^ W16 ^ W28 ^ W32
|
||||
//xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_32_79_2 arg0 // input argument : W
|
||||
vpslld $2, \arg0, W_TMP // W << 2
|
||||
vpsrld $30, \arg0, \arg0 // W >> 30
|
||||
vpor W_TMP, \arg0, \arg0 // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
|
||||
vpaddd \arg2(K_BASE), \arg0, W_TMP // W + K
|
||||
xmov W_TMP, WK(\arg1&~3) // write W+K
|
||||
.endm
|
||||
|
||||
|
||||
/* The hash update operation is completed by the following statements.
|
||||
|
||||
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
|
||||
B[i+1] = A[i];
|
||||
C[i+1] = ROTATE_LEFT( B[i], 30 );
|
||||
D[i+1] = C[i];
|
||||
E[i+1] = D[i];
|
||||
|
||||
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
|
||||
|
||||
A1 = FN + E0 + rol(A0,5) + WK;
|
||||
B1 = A0;
|
||||
C1 = rol(B0, 30);
|
||||
D1 = C0;
|
||||
E1 = D0;
|
||||
|
||||
to avoid excessive memory movement between registers,
|
||||
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
|
||||
2. C1 = rol(B0,30) can be temporarily saved in B0.
|
||||
|
||||
Therefore, ignoring the time index, the update operation is equivalent to
|
||||
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
2. B = rol(B,30)
|
||||
3. the hashes are now stored in the order of E,A,B,C,D
|
||||
|
||||
|
||||
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
|
||||
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
2. B = rol(B,30)
|
||||
// now the hashes are in the order of E,A,B,C,D
|
||||
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
|
||||
4. A = rol(A,30)
|
||||
// now the hashes are in the order of D,E,A,B,C
|
||||
|
||||
These operations are distributed into the following 2 macro definitions RR0 and RR1.
|
||||
|
||||
*/
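/*
   Hedged C sketch of one RR0/RR1 pair (illustrative only; fn and wk[] are
   stand-ins for the F1-F4 selection and the precomputed W+K stack buffer):

   #include <stdint.h>
   static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

   static void two_rounds(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
                          uint32_t (*fn)(uint32_t, uint32_t, uint32_t),
                          const uint32_t *wk, int i)
   {
       *e += fn(*b, *c, *d) + rol32(*a, 5) + wk[i];      // round i:   result lands in E
       *b  = rol32(*b, 30);
       *d += fn(*a, *b, *c) + rol32(*e, 5) + wk[i + 1];  // round i+1: result lands in D
       *a  = rol32(*a, 30);                              // caller now reads the order D,E,A,B,C
   }
*/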
|
||||
|
||||
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
|
||||
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
|
||||
rol $30, \arg2 // B = rol(B,30)
|
||||
add WK(\arg6), \arg5 // E + WK(i)
|
||||
rorx $27, \arg1, T2 // T2 = rol(A,5) (rotate right by 27 == rotate left by 5)
|
||||
add WK(\arg6+1), \arg4 // D + WK(i+1)
|
||||
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
|
||||
.endm
|
||||
|
||||
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
|
||||
add T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
rorx $27, \arg5, T2 // rol(E,5)
|
||||
add T2, \arg4 // D + WK(i+1) + rol(E,5)
|
||||
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
|
||||
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
|
||||
rol $30, \arg1 // A = rol(A,30)
|
||||
.endm
|
||||
|
||||
|
||||
.macro INITIAL_W_PRECALC // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
|
||||
|
||||
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
|
||||
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
|
||||
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
|
||||
|
||||
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
|
||||
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
|
||||
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
|
||||
|
||||
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
|
||||
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
|
||||
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
|
||||
|
||||
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
|
||||
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
|
||||
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro INTERNAL // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
|
||||
|
||||
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
|
||||
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
|
||||
RR0 F1,A,B,C,D,E,0
|
||||
W_PRECALC_16_31_1 W0,W16
|
||||
RR1 F1,A,B,C,D,E,0
|
||||
W_PRECALC_16_31_2 W16
|
||||
RR0 F1,D,E,A,B,C,2
|
||||
W_PRECALC_16_31_3 W16, 2, 0
|
||||
RR1 F1,D,E,A,B,C,2
|
||||
|
||||
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
|
||||
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
|
||||
RR0 F1,B,C,D,E,A,4
|
||||
W_PRECALC_16_31_1 W28,W12
|
||||
RR1 F1,B,C,D,E,A,4
|
||||
W_PRECALC_16_31_2 W12
|
||||
RR0 F1,E,A,B,C,D,6
|
||||
W_PRECALC_16_31_3 W12, 6, 16
|
||||
RR1 F1,E,A,B,C,D,6
|
||||
|
||||
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
|
||||
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
|
||||
RR0 F1,C,D,E,A,B,8
|
||||
W_PRECALC_16_31_1 W24,W8
|
||||
RR1 F1,C,D,E,A,B,8
|
||||
W_PRECALC_16_31_2 W8
|
||||
RR0 F1,A,B,C,D,E,10
|
||||
W_PRECALC_16_31_3 W8,10,16
|
||||
RR1 F1,A,B,C,D,E,10
|
||||
|
||||
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
|
||||
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
|
||||
RR0 F1,D,E,A,B,C,12
|
||||
W_PRECALC_16_31_1 W20,W4
|
||||
RR1 F1,D,E,A,B,C,12
|
||||
W_PRECALC_16_31_2 W4
|
||||
RR0 F1,B,C,D,E,A,14
|
||||
W_PRECALC_16_31_3 W4,14,16
|
||||
RR1 F1,B,C,D,E,A,14
|
||||
|
||||
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_32_79_0 W28,W8,W4,W0
|
||||
RR0 F1,E,A,B,C,D,16
|
||||
W_PRECALC_32_79_1 W16,W0
|
||||
RR1 F1,E,A,B,C,D,16
|
||||
W_PRECALC_32_79_2 W0
|
||||
RR0 F1,C,D,E,A,B,18
|
||||
W_PRECALC_32_79_3 W0,18,16
|
||||
RR1 F1,C,D,E,A,B,18
|
||||
|
||||
// starting using F2
|
||||
|
||||
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
W_PRECALC_32_79_0 W24,W4,W0,W28
|
||||
RR0 F2,A,B,C,D,E,20
|
||||
W_PRECALC_32_79_1 W12,W28
|
||||
RR1 F2,A,B,C,D,E,20
|
||||
W_PRECALC_32_79_2 W28
|
||||
RR0 F2,D,E,A,B,C,22
|
||||
W_PRECALC_32_79_3 W28,22,16
|
||||
RR1 F2,D,E,A,B,C,22
|
||||
|
||||
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
#undef K_XMM
|
||||
#define K_XMM 32
|
||||
W_PRECALC_32_79_0 W20,W0,W28,W24
|
||||
RR0 F2,B,C,D,E,A,24
|
||||
W_PRECALC_32_79_1 W8,W24
|
||||
RR1 F2,B,C,D,E,A,24
|
||||
W_PRECALC_32_79_2 W24
|
||||
RR0 F2,E,A,B,C,D,26
|
||||
W_PRECALC_32_79_3 W24,26,K_XMM
|
||||
RR1 F2,E,A,B,C,D,26
|
||||
|
||||
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_32_79_0 W16,W28,W24,W20
|
||||
RR0 F2,C,D,E,A,B,28
|
||||
W_PRECALC_32_79_1 W4,W20
|
||||
RR1 F2,C,D,E,A,B,28
|
||||
W_PRECALC_32_79_2 W20
|
||||
RR0 F2,A,B,C,D,E,30
|
||||
W_PRECALC_32_79_3 W20,30,K_XMM
|
||||
RR1 F2,A,B,C,D,E,30
|
||||
|
||||
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
|
||||
W_PRECALC_32_79_0 W12,W24,W20,W16
|
||||
RR0 F2,D,E,A,B,C,32
|
||||
W_PRECALC_32_79_1 W0,W16
|
||||
RR1 F2,D,E,A,B,C,32
|
||||
W_PRECALC_32_79_2 W16
|
||||
RR0 F2,B,C,D,E,A,34
|
||||
W_PRECALC_32_79_3 W16,34,K_XMM
|
||||
RR1 F2,B,C,D,E,A,34
|
||||
|
||||
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
|
||||
W_PRECALC_32_79_0 W8,W20,W16,W12
|
||||
RR0 F2,E,A,B,C,D,36
|
||||
W_PRECALC_32_79_1 W28,W12
|
||||
RR1 F2,E,A,B,C,D,36
|
||||
W_PRECALC_32_79_2 W12
|
||||
RR0 F2,C,D,E,A,B,38
|
||||
W_PRECALC_32_79_3 W12,38,K_XMM
|
||||
RR1 F2,C,D,E,A,B,38
|
||||
|
||||
// starting using F3
|
||||
|
||||
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
|
||||
W_PRECALC_32_79_0 W4,W16,W12,W8
|
||||
RR0 F3,A,B,C,D,E,40
|
||||
W_PRECALC_32_79_1 W24,W8
|
||||
RR1 F3,A,B,C,D,E,40
|
||||
W_PRECALC_32_79_2 W8
|
||||
RR0 F3,D,E,A,B,C,42
|
||||
W_PRECALC_32_79_3 W8,42,K_XMM
|
||||
RR1 F3,D,E,A,B,C,42
|
||||
|
||||
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
|
||||
#undef K_XMM
|
||||
#define K_XMM 48
|
||||
W_PRECALC_32_79_0 W0,W12,W8,W4
|
||||
RR0 F3,B,C,D,E,A,44
|
||||
W_PRECALC_32_79_1 W20,W4
|
||||
RR1 F3,B,C,D,E,A,44
|
||||
W_PRECALC_32_79_2 W4
|
||||
RR0 F3,E,A,B,C,D,46
|
||||
W_PRECALC_32_79_3 W4,46,K_XMM
|
||||
RR1 F3,E,A,B,C,D,46
|
||||
|
||||
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_32_79_0 W28,W8,W4,W0
|
||||
RR0 F3,C,D,E,A,B,48
|
||||
W_PRECALC_32_79_1 W16,W0
|
||||
RR1 F3,C,D,E,A,B,48
|
||||
W_PRECALC_32_79_2 W0
|
||||
RR0 F3,A,B,C,D,E,50
|
||||
W_PRECALC_32_79_3 W0,50,K_XMM
|
||||
RR1 F3,A,B,C,D,E,50
|
||||
|
||||
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
W_PRECALC_32_79_0 W24,W4,W0,W28
|
||||
RR0 F3,D,E,A,B,C,52
|
||||
W_PRECALC_32_79_1 W12,W28
|
||||
RR1 F3,D,E,A,B,C,52
|
||||
W_PRECALC_32_79_2 W28
|
||||
RR0 F3,B,C,D,E,A,54
|
||||
W_PRECALC_32_79_3 W28,54,K_XMM
|
||||
RR1 F3,B,C,D,E,A,54
|
||||
|
||||
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
W_PRECALC_32_79_0 W20,W0,W28,W24
|
||||
RR0 F3,E,A,B,C,D,56
|
||||
W_PRECALC_32_79_1 W8,W24
|
||||
RR1 F3,E,A,B,C,D,56
|
||||
W_PRECALC_32_79_2 W24
|
||||
RR0 F3,C,D,E,A,B,58
|
||||
W_PRECALC_32_79_3 W24,58,K_XMM
|
||||
RR1 F3,C,D,E,A,B,58
|
||||
|
||||
// starting using F4
|
||||
|
||||
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_32_79_0 W16,W28,W24,W20
|
||||
RR0 F4,A,B,C,D,E,60
|
||||
W_PRECALC_32_79_1 W4,W20
|
||||
RR1 F4,A,B,C,D,E,60
|
||||
W_PRECALC_32_79_2 W20
|
||||
RR0 F4,D,E,A,B,C,62
|
||||
W_PRECALC_32_79_3 W20,62,K_XMM
|
||||
RR1 F4,D,E,A,B,C,62
|
||||
|
||||
.endm
|
||||
|
||||
.macro SOFTWARE_PIPELINING
|
||||
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
|
||||
RR0 F4,B,C,D,E,A,64
|
||||
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
|
||||
RR1 F4,B,C,D,E,A,64
|
||||
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
|
||||
RR0 F4,E,A,B,C,D,66
|
||||
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
|
||||
RR1 F4,E,A,B,C,D,66
|
||||
|
||||
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
|
||||
RR0 F4,C,D,E,A,B,68
|
||||
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
|
||||
RR1 F4,C,D,E,A,B,68
|
||||
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
|
||||
RR0 F4,A,B,C,D,E,70
|
||||
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
|
||||
RR1 F4,A,B,C,D,E,70
|
||||
|
||||
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
|
||||
RR0 F4,D,E,A,B,C,72
|
||||
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
|
||||
RR1 F4,D,E,A,B,C,72
|
||||
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
|
||||
RR0 F4,B,C,D,E,A,74
|
||||
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
|
||||
RR1 F4,B,C,D,E,A,74
|
||||
|
||||
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
|
||||
RR0 F4,E,A,B,C,D,76
|
||||
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
|
||||
RR1 F4,E,A,B,C,D,76
|
||||
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
|
||||
RR0 F4,C,D,E,A,B,78
|
||||
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
|
||||
RR1 F4,C,D,E,A,B,78
|
||||
.endm
|
||||
|
||||
|
||||
#undef W_PRECALC_00_15_0
|
||||
#undef W_PRECALC_00_15_1
|
||||
#undef W_PRECALC_16_31_0
|
||||
#undef W_PRECALC_32_79_0
|
||||
|
||||
.macro ENDING // finish up updating hash digests (i=64:79)
|
||||
//i=80
|
||||
RR0 F4,B,C,D,E,A,64
|
||||
RR1 F4,B,C,D,E,A,64
|
||||
RR0 F4,E,A,B,C,D,66
|
||||
RR1 F4,E,A,B,C,D,66
|
||||
|
||||
//i=84
|
||||
RR0 F4,C,D,E,A,B,68
|
||||
RR1 F4,C,D,E,A,B,68
|
||||
RR0 F4,A,B,C,D,E,70
|
||||
RR1 F4,A,B,C,D,E,70
|
||||
|
||||
//i=88
|
||||
RR0 F4,D,E,A,B,C,72
|
||||
RR1 F4,D,E,A,B,C,72
|
||||
RR0 F4,B,C,D,E,A,74
|
||||
RR1 F4,B,C,D,E,A,74
|
||||
|
||||
//i=92
|
||||
RR0 F4,E,A,B,C,D,76
|
||||
RR1 F4,E,A,B,C,D,76
|
||||
RR0 F4,C,D,E,A,B,78
|
||||
RR1 F4,C,D,E,A,B,78
|
||||
.endm
|
||||
|
||||
// load hash digests A,B,C,D,E from memory into registers
|
||||
.macro LOAD_HASH
|
||||
mov (HASH_PTR), A
|
||||
mov 4(HASH_PTR), B
|
||||
mov 8(HASH_PTR), C
|
||||
mov 12(HASH_PTR), D
|
||||
mov 16(HASH_PTR), E
|
||||
.endm
|
||||
|
||||
.macro UPDATE_HASH arg0, arg1
|
||||
add \arg0, \arg1
|
||||
mov \arg1, \arg0
|
||||
.endm
|
||||
|
||||
.macro UPDATE_ALL_HASH
|
||||
UPDATE_HASH (HASH_PTR), A
|
||||
UPDATE_HASH 4(HASH_PTR), B
|
||||
UPDATE_HASH 8(HASH_PTR), C
|
||||
UPDATE_HASH 12(HASH_PTR), D
|
||||
UPDATE_HASH 16(HASH_PTR), E
|
||||
.endm
|
||||
|
||||
|
||||
/*
|
||||
main sha1 code for system with avx2 support
|
||||
*/
|
||||
|
||||
.macro SHA1_PIPELINED_MAIN_BODY
|
||||
LOAD_HASH // load initial hashes into A,B,C,D,E
|
||||
INITIAL_W_PRECALC // big_endian_load(W) and W+K (i=0:15)
|
||||
.p2align 4,0x90
|
||||
0:
|
||||
INTERNAL // update W (i=16:79) and update ABCDE (i=0:63)
|
||||
#if Multiple_Blocks
|
||||
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
|
||||
subq _IMM(1), cnt // pre-decrement cnt by 1
|
||||
jbe 1f // if cnt <= 0, branch to finish off
|
||||
SOFTWARE_PIPELINING // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
|
||||
UPDATE_ALL_HASH // update output hashes
|
||||
jmp 0b // repeat for next block
|
||||
.p2align 4,0x90
|
||||
1:
|
||||
#endif
|
||||
ENDING // update ABCDE (i=64:79)
|
||||
UPDATE_ALL_HASH // update output hashes
|
||||
.endm
|
||||
|
||||
/*
|
||||
I removed the cpu capabilities check. The check is now down
|
||||
in C code and the appropriate version of the assembler code
|
||||
is selected.
|
||||
*/
|
||||
.text
|
||||
|
||||
.globl _AccelerateCrypto_SHA1_compress_AVX2
|
||||
_AccelerateCrypto_SHA1_compress_AVX2:
|
||||
|
||||
// start the sha1 code with avx2 support
|
||||
|
||||
// save callee-save registers
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
push %rbx
|
||||
push %r15
|
||||
|
||||
sub $stack_size, sp // allocate stack memory for use
|
||||
|
||||
// save used xmm register if this is for kernel
|
||||
#if BUILDKERNEL
|
||||
andq $-32, sp // aligned sp to 32-bytes
|
||||
leaq 4*16(sp), %rax
|
||||
xmov %ymm0, 0*32(%rax)
|
||||
xmov %ymm1, 1*32(%rax)
|
||||
xmov %ymm2, 2*32(%rax)
|
||||
xmov %ymm3, 3*32(%rax)
|
||||
xmov %ymm4, 4*32(%rax)
|
||||
xmov %ymm5, 5*32(%rax)
|
||||
xmov %ymm6, 6*32(%rax)
|
||||
xmov %ymm7, 7*32(%rax)
|
||||
xmov %ymm8, 8*32(%rax)
|
||||
xmov %ymm9, 9*32(%rax)
|
||||
#endif
|
||||
|
||||
|
||||
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
|
||||
mov ctx, HASH_PTR
|
||||
mov buf, BUFFER_PTR
|
||||
#if Multiple_Blocks
|
||||
mov %rsi, cnt
|
||||
#endif
|
||||
lea K_XMM_AR(%rip), K_BASE
|
||||
|
||||
|
||||
SHA1_PIPELINED_MAIN_BODY
|
||||
|
||||
// restore used xmm registers if this is for kernel
|
||||
#if BUILDKERNEL
|
||||
leaq 4*16(sp), %rax
|
||||
xmov 0*32(%rax), %ymm0
|
||||
xmov 1*32(%rax), %ymm1
|
||||
xmov 2*32(%rax), %ymm2
|
||||
xmov 3*32(%rax), %ymm3
|
||||
xmov 4*32(%rax), %ymm4
|
||||
xmov 5*32(%rax), %ymm5
|
||||
xmov 6*32(%rax), %ymm6
|
||||
xmov 7*32(%rax), %ymm7
|
||||
xmov 8*32(%rax), %ymm8
|
||||
xmov 9*32(%rax), %ymm9
|
||||
#endif
|
||||
|
||||
leaq -16(%rbp), %rsp
|
||||
|
||||
// restore callee-save registers
|
||||
pop %r15
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
|
||||
ret // return
|
||||
|
||||
CC_ASM_SECTION_CONST
|
||||
.p2align 4, 0x90
|
||||
|
||||
#define K1 0x5a827999
|
||||
#define K2 0x6ed9eba1
|
||||
#define K3 0x8f1bbcdc
|
||||
#define K4 0xca62c1d6
|
||||
|
||||
K_XMM_AR:
|
||||
.long K1
|
||||
.long K1
|
||||
.long K1
|
||||
.long K1
|
||||
.long K2
|
||||
.long K2
|
||||
.long K2
|
||||
.long K2
|
||||
.long K3
|
||||
.long K3
|
||||
.long K3
|
||||
.long K3
|
||||
.long K4
|
||||
.long K4
|
||||
.long K4
|
||||
.long K4
|
||||
REV32:
|
||||
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
|
||||
.long 0x00010203
|
||||
.long 0x04050607
|
||||
.long 0x08090a0b
|
||||
.long 0x0c0d0e0f
|
||||
|
||||
|
||||
#endif // defined(__x86_64__)
|
||||
|
||||
|
|
@ -0,0 +1,983 @@
|
|||
# Copyright (c) (2010,2011,2012,2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
#if (defined(__x86_64__) || defined(__i386__))
|
||||
|
||||
/* vng_sha1LittleEndian.s : this file provides optimized x86_64 and i386 implementations of the sha1 function
|
||||
CoreOS - vector and numerics group
|
||||
|
||||
The implementation is based on the principle described in an Intel online article
|
||||
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
|
||||
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
|
||||
|
||||
|
||||
Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
|
||||
|
||||
void SHA1( int HASH[], int MESSAGE[] )
|
||||
{
|
||||
int A[81], B[81], C[81], D[81], E[81];
|
||||
int W[80];
|
||||
|
||||
int i, FN;
|
||||
|
||||
A[0] = HASH[0];
|
||||
B[0] = HASH[1];
|
||||
C[0] = HASH[2];
|
||||
D[0] = HASH[3];
|
||||
E[0] = HASH[4];
|
||||
|
||||
for ( i=0; i<80; ++i )
|
||||
{
|
||||
if ( i < 16 )
|
||||
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
|
||||
else
|
||||
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
|
||||
|
||||
FN = F( i, B[i], C[i], D[i] );
|
||||
|
||||
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
|
||||
B[i+1] = A[i];
|
||||
C[i+1] = ROTATE_LEFT( B[i], 30 );
|
||||
D[i+1] = C[i];
|
||||
E[i+1] = D[i];
|
||||
}
|
||||
|
||||
HASH[0] += A[80];
|
||||
HASH[1] += B[80];
|
||||
HASH[2] += C[80];
|
||||
HASH[3] += D[80];
|
||||
HASH[4] += E[80];
|
||||
}
|
||||
|
||||
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
|
||||
|
||||
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
|
||||
|
||||
1. done on 4 consecutive W[i] values in a single XMM register
|
||||
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
|
||||
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
|
||||
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
|
||||
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
|
||||
|
||||
2. this additional calculation unfortunately requires many additional operations
|
||||
W[i+3] ^= W[i] rol 1
|
||||
|
||||
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
|
||||
W[i:i+3] += {K,K,K,K}
|
||||
|
||||
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
|
||||
The Dean Gaudet approach can be expressed as
|
||||
|
||||
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
|
||||
2. W[i+3] ^= W[i] rol 1
|
||||
3. W0 += {K,K,K,K}
|
||||
|
||||
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
|
||||
|
||||
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
|
||||
|
||||
Note:
|
||||
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
|
||||
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
|
||||
i=0, W28,W24,...,W0
|
||||
i=4, W24,W20,...,W28
|
||||
i=8, W20,W16,...,W24
|
||||
.
|
||||
.
|
||||
and so forth.
|
||||
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
|
||||
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
|
||||
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
|
||||
|
||||
*/
|
||||
|
||||
/* the code can be compiled into a single-block (64 bytes) per-call mode by setting Multiple_Blocks to 0 */
|
||||
#define Multiple_Blocks 1
|
||||
|
||||
#if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
// set up for x86_64
|
||||
#define stack_size (16*11+16*4) // x0-x10 + 4 128-bits for intermediate WK(t) storage
|
||||
#define sp %rsp // unifying architectural stack pointer representation
|
||||
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
|
||||
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
|
||||
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
|
||||
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
|
||||
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
|
||||
#define BUFFER_PTR %r10 // pointer to input blocks
|
||||
|
||||
#else // !__x86_64__
|
||||
|
||||
// set up for i386
|
||||
#define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t))
|
||||
#define sp %esp // unifying architectural stack pointer representation
|
||||
#define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
|
||||
#define cnt stack_size+16+8(sp) // use 2nd input argument from caller function
|
||||
#define BUFFER_PTR stack_size+16+12(sp) // use 3rd input argument from caller function
|
||||
#define K_BASE stack_size-4(sp) // use for K_BASE
|
||||
|
||||
#endif // __x86_64__
|
||||
|
||||
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
|
||||
|
||||
#define W_TMP %xmm0
|
||||
#define W_TMP2 %xmm1
|
||||
#define W0 %xmm2
|
||||
#define W4 %xmm3
|
||||
#define W8 %xmm4
|
||||
#define W12 %xmm5
|
||||
#define W16 %xmm6
|
||||
#define W20 %xmm7
|
||||
#if defined(__x86_64__)
|
||||
#define W24 %xmm8
|
||||
#define W28 %xmm9
|
||||
#define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported
|
||||
#else // defined (__i386__)
|
||||
#define W24 12*16(sp)
|
||||
#define W28 13*16(sp)
|
||||
#define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported
|
||||
#endif
|
||||
|
||||
#define xmov movaps // aligned 16-byte move
|
||||
#define xmovu movups // unaligned 16-byte move
|
||||
|
||||
// intermediate hash variables
|
||||
#define A %ecx
|
||||
#define B %esi
|
||||
#define C %edi
|
||||
#if defined(__x86_64__)
|
||||
#define D %r15d
|
||||
#else
|
||||
#define D %ebp
|
||||
#endif
|
||||
#define E %edx
|
||||
|
||||
// temp variables
|
||||
#define T1 %eax
|
||||
#define T2 %ebx
|
||||
|
||||
#define WK(t) ((t)&15)*4(sp)
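// WK(t) indexes a 16-entry circular buffer of W+K words at the bottom of the
// stack frame; (t)&15 wraps the round index so the slots are reused every
// 16 rounds.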
|
||||
|
||||
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
|
||||
// result in T1
|
||||
.macro F1 arg0, arg1, arg2
|
||||
mov \arg1, T1
|
||||
xor \arg2, T1
|
||||
and \arg0, T1
|
||||
xor \arg2, T1
|
||||
.endm
|
||||
|
||||
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
|
||||
// result in T1
|
||||
.macro F2 arg0, arg1, arg2
|
||||
mov \arg2, T1
|
||||
xor \arg1, T1
|
||||
xor \arg0, T1
|
||||
.endm
|
||||
|
||||
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
|
||||
// result in T1
|
||||
.macro F3 arg0, arg1, arg2
|
||||
mov \arg1, T1
|
||||
mov \arg0, T2
|
||||
or \arg0, T1
|
||||
and \arg1, T2
|
||||
and \arg2, T1
|
||||
or T2, T1
|
||||
.endm
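/*
   C equivalents of the selections above (a hedged sketch; the forms chosen
   match the instruction sequences: F1 as D ^ (B & (C ^ D)) needs only one
   temporary, and F3 as ((B | C) & D) | (B & C) equals the majority function):

   #include <stdint.h>
   uint32_t f1(uint32_t b, uint32_t c, uint32_t d) { return d ^ (b & (c ^ d)); }
   uint32_t f2(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
   uint32_t f3(uint32_t b, uint32_t c, uint32_t d) { return ((b | c) & d) | (b & c); }
*/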
|
||||
|
||||
// for i=60:79, F4 is identical to F2
|
||||
#define F4 F2
|
||||
|
||||
|
||||
/*
|
||||
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
|
||||
|
||||
with ssse3 support, this is achieved via
|
||||
for (i=0;i<16;i+=4) {
|
||||
1. W_TMP = new 16 bytes from MESSAGE[]
|
||||
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
|
||||
3. W_TMP += {K,K,K,K};
|
||||
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
|
||||
}
|
||||
|
||||
each step is represented in one of the following 4 macro definitions
|
||||
|
||||
*/
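/*
   Hedged C sketch of this i=0:15 loop (illustrative only; bswap32 stands for
   the byte swap that pshufb performs here, and wk[] for the stack buffer):

   #include <stdint.h>
   #include <string.h>
   static uint32_t bswap32(uint32_t v)
   {
       return (v >> 24) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | (v << 24);
   }

   static void precalc_00_15(const uint8_t *msg, uint32_t k1, uint32_t w[16], uint32_t wk[16])
   {
       for (int i = 0; i < 16; i++) {
           uint32_t v;
           memcpy(&v, msg + 4 * i, 4);   // possibly unaligned load, like movups
           w[i]  = bswap32(v);           // BIG_ENDIAN_LOAD
           wk[i] = w[i] + k1;            // W + K, saved for the round updates
       }
   }
*/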
|
||||
|
||||
.macro W_PRECALC_00_15_0_ssse3 arg0 // input argument $0 : 0/4/8/12
|
||||
#if defined (__x86_64__) // BUFFER_PTR is already an address register in x86_64
|
||||
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
|
||||
#else // BUFFER_PTR is from the argument set up in the caller
|
||||
mov BUFFER_PTR, T1 // T1 = BUFFER_PTR
|
||||
xmovu \arg0*4(T1), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_00_15_1_ssse3 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
|
||||
pshufb XMM_SHUFB_BSWAP, W_TMP // convert W_TMP from little-endian into big-endian
|
||||
xmov W_TMP, \arg0 // save W_TMP in the circular buffer
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_00_15_2 // K_BASE points to the current K quadruple.
|
||||
#if defined (__x86_64__) // K_BASE is already an address register in x86_64
|
||||
paddd (K_BASE), W_TMP // W_TMP += {K,K,K,K};
|
||||
#else // K_BASE is previously set up in the stack memory
|
||||
mov K_BASE, T1 // T1 = K_BASE
|
||||
paddd (T1), W_TMP // W_TMP += {K,K,K,K};
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_00_15_3 arg0
|
||||
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
|
||||
.endm
|
||||
|
||||
// rounds 16-31 compute W[i] (4 at a time) using the vectorization approach by Dean Gaudet
|
||||
/*
|
||||
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
|
||||
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
|
||||
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
|
||||
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
|
||||
|
||||
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1; if we are taking from the initial W before rol 1, we should rol this by 2
|
||||
|
||||
The operation (updating W and W+K) is scheduled and divided into 4 steps
|
||||
|
||||
0. W_tmp = W3; W = W14 ^ W8
|
||||
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
|
||||
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
|
||||
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP + K;
|
||||
|
||||
*/
|
||||
|
||||
.macro W_PRECALC_16_31_0_ssse3 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
|
||||
xmov \arg1, \arg4 // W = W12
|
||||
palignr $8, \arg0, \arg4 // W = W14
|
||||
xmov \arg3, W_TMP // W_TMP = W4
|
||||
psrldq $4, W_TMP // W_TMP = W3
|
||||
pxor \arg2, \arg4 // W = W8 ^ W14
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
|
||||
pxor \arg0, W_TMP // W_TMP = W3 ^ W16
|
||||
pxor W_TMP, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
|
||||
xmov \arg1, W_TMP2 // W_TMP2 = W3 ^ W16 ^ W8 ^ W14
|
||||
xmov \arg1, W_TMP // W_TMP = W3 ^ W16 ^ W8 ^ W14
|
||||
pslldq $12, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_16_31_2 arg0 // input argument : W
|
||||
psrld $31, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
|
||||
pslld $1, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
|
||||
por \arg0, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
|
||||
xmov W_TMP2, \arg0 // copy W[i] at location of W[i+3]
|
||||
psrld $30, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
|
||||
pslld $2, \arg0 // W = W[i] higher 30 bits after rol 2
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
|
||||
#if defined (__i386__)
|
||||
mov K_BASE, T1 // K_BASE is stored in the stack memory for i386
|
||||
#endif
|
||||
pxor \arg0, W_TMP
|
||||
pxor W_TMP2, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
|
||||
xmov W_TMP, \arg0 // save W = W_TMP in the W circular buffer
|
||||
#if defined (__x86_64__)
|
||||
paddd \arg2(K_BASE), W_TMP // W+K
|
||||
#else
|
||||
paddd \arg2(T1), W_TMP // W+K
|
||||
#endif
|
||||
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
|
||||
.endm
|
||||
|
||||
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
|
||||
|
||||
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
|
||||
|
||||
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
|
||||
|
||||
|
||||
0. W_tmp = W6; W = W28 ^ W32;
|
||||
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
|
||||
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
|
||||
3. W = W_Tmp; WK = W_tmp + K;
|
||||
|
||||
*/
|
||||
|
||||
|
||||
.macro W_PRECALC_32_79_0_ssse3 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
|
||||
xmov \arg2, W_TMP // (w1 w2 w3 w4)
|
||||
pxor \arg0, \arg3 // W = W28 ^ W32;
|
||||
palignr $8, \arg1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
|
||||
.endm
|
||||
|
||||
// this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
|
||||
.macro W_PRECALC_32_79_0_i386_ssse3 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
|
||||
xmov \arg3, W_TMP // W32
|
||||
pxor \arg0, W_TMP // W28 ^ W32
|
||||
xmov W_TMP, \arg3 // W = W28 ^ W32;
|
||||
xmov \arg2, W_TMP // W4
|
||||
palignr $8, \arg1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
|
||||
pxor \arg0, W_TMP // W_tmp = W6 ^ W16
|
||||
pxor \arg1, W_TMP // W_tmp = W6 ^ W16 ^ W28 ^ W32
|
||||
xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_32_79_2 arg0 // input argument : W
|
||||
psrld $30, \arg0 // W >> 30
|
||||
pslld $2, W_TMP // W << 2
|
||||
por \arg0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
|
||||
.endm
|
||||
|
||||
// this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
|
||||
// this should be used when the input is either W24 or W28 on i386 architecture
|
||||
.macro W_PRECALC_32_79_2_i386 arg0 // input argument : W
|
||||
xmov \arg0, W_TMP2 // W
|
||||
psrld $30, W_TMP2 // W >> 30
|
||||
xmov W_TMP2, \arg0 // save (W >> 30) at W
|
||||
pslld $2, W_TMP // W_tmp << 2
|
||||
por \arg0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
|
||||
.endm
|
||||
|
||||
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
|
||||
#if defined (__x86_64__)
|
||||
xmov W_TMP, \arg0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
|
||||
paddd \arg2(K_BASE), W_TMP // W + K
|
||||
xmov W_TMP, WK(\arg1&~3) // write W+K
|
||||
#else
|
||||
mov K_BASE, T1 // T1 = K_BASE (stored in the stack memory for i386)
|
||||
xmov W_TMP, \arg0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
|
||||
paddd \arg2(T1), W_TMP // W_tmp = W + K
|
||||
xmov W_TMP, WK(\arg1&~3) // write WK
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
/* The hash update operation is completed by the following statements.
|
||||
|
||||
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
|
||||
B[i+1] = A[i];
|
||||
C[i+1] = ROTATE_LEFT( B[i], 30 );
|
||||
D[i+1] = C[i];
|
||||
E[i+1] = D[i];
|
||||
|
||||
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
|
||||
|
||||
A1 = FN + E0 + rol(A0,5) + WK;
|
||||
B1 = A0;
|
||||
C1 = rol(B0, 30);
|
||||
D1 = C0;
|
||||
E1 = D0;
|
||||
|
||||
to avoid excessive memory movement between registers,
|
||||
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
|
||||
2. C1 = rol(B0,30) can be temporarily saved in B0.
|
||||
|
||||
Therefore, ignoring the time index, the update operation is equivalent to
|
||||
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
2. B = rol(B,30)
|
||||
3. the hashes are now stored in the order of E,A,B,C,D
|
||||
|
||||
|
||||
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
|
||||
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
2. B = rol(B,30)
|
||||
// now the hashes are in the order of E,A,B,C,D
|
||||
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
|
||||
4. A = rol(A,30)
|
||||
// now the hashes are in the order of D,E,A,B,C
|
||||
|
||||
These operations are distributed into the following 2 macro definitions RR0 and RR1.
|
||||
|
||||
*/
|
||||
|
||||
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
|
||||
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
|
||||
add WK(\arg6), \arg5 // E + WK(i)
|
||||
rol $30, \arg2 // B = rol(B,30)
|
||||
mov \arg1, T2 // T2 = A
|
||||
add WK(\arg6+1), \arg4 // D + WK(i+1)
|
||||
rol $5, T2 // rol(A,5)
|
||||
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
|
||||
.endm
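// (The AVX2 variant of RR0 instead forms rol(A,5) with a single
// non-destructive rorx $27; this ssse3 version needs the extra mov into T2.)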
|
||||
|
||||
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
|
||||
add \arg5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
mov T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
|
||||
rol $5, T2 // rol(E,5)
|
||||
add T2, \arg4 // D + WK(i+1) + rol(E,5)
|
||||
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
|
||||
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
|
||||
rol $30, \arg1 // A = rol(A,30)
|
||||
.endm
|
||||
|
||||
|
||||
|
||||
/*
|
||||
|
||||
The following macro definitions are used to expand code for the per-block sha1 operation.
|
||||
|
||||
INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
|
||||
INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
|
||||
ENDING : finishing up updating the digests A/B/C/D/E (i=64:79)
|
||||
|
||||
For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
|
||||
into 1 macro definition for software pipelining.
|
||||
|
||||
SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up updating the digests A/B/C/D/E (i=64:79)
|
||||
|
||||
assume cnt (the number of blocks) >= 1, the main code body should look like
|
||||
|
||||
INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15)
|
||||
do {
|
||||
INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
|
||||
cnt--;
|
||||
if (cnt==0) break;
|
||||
BUFFER_PTR += 64;
|
||||
SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
|
||||
}
|
||||
ENDING // update hash digests A/B/C/D/E (i=64:79)
|
||||
|
||||
*/
|
||||
|
||||
#define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3
|
||||
#define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3
|
||||
#define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3
|
||||
#define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3
|
||||
#define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3
|
||||
|
||||
|
||||
.macro INITIAL_W_PRECALC_ssse3 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
|
||||
|
||||
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
|
||||
W_PRECALC_00_15_2 // W_TMP = W0 + K
|
||||
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
|
||||
|
||||
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
|
||||
W_PRECALC_00_15_2 // W_TMP = W28 + K
|
||||
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
|
||||
|
||||
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
|
||||
W_PRECALC_00_15_2 // W_TMP = W24 + K
|
||||
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
|
||||
|
||||
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
|
||||
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
|
||||
W_PRECALC_00_15_2 // W_TMP = W20 + K
|
||||
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro INTERNAL_ssse3 // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
|
||||
|
||||
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
|
||||
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
|
||||
RR0 F1,A,B,C,D,E,0
|
||||
W_PRECALC_16_31_1 W0,W16
|
||||
RR1 F1,A,B,C,D,E,0
|
||||
W_PRECALC_16_31_2 W16
|
||||
RR0 F1,D,E,A,B,C,2
|
||||
W_PRECALC_16_31_3 W16, 2, 0
|
||||
RR1 F1,D,E,A,B,C,2
|
||||
|
||||
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
|
||||
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
|
||||
RR0 F1,B,C,D,E,A,4
|
||||
W_PRECALC_16_31_1 W28,W12
|
||||
RR1 F1,B,C,D,E,A,4
|
||||
W_PRECALC_16_31_2 W12
|
||||
RR0 F1,E,A,B,C,D,6
|
||||
W_PRECALC_16_31_3 W12, 6, 16
|
||||
RR1 F1,E,A,B,C,D,6
|
||||
|
||||
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
|
||||
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
|
||||
RR0 F1,C,D,E,A,B,8
|
||||
W_PRECALC_16_31_1 W24,W8
|
||||
RR1 F1,C,D,E,A,B,8
|
||||
W_PRECALC_16_31_2 W8
|
||||
RR0 F1,A,B,C,D,E,10
|
||||
W_PRECALC_16_31_3 W8,10,16
|
||||
RR1 F1,A,B,C,D,E,10
|
||||
|
||||
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
|
||||
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
|
||||
RR0 F1,D,E,A,B,C,12
|
||||
W_PRECALC_16_31_1 W20,W4
|
||||
RR1 F1,D,E,A,B,C,12
|
||||
W_PRECALC_16_31_2 W4
|
||||
RR0 F1,B,C,D,E,A,14
|
||||
W_PRECALC_16_31_3 W4,14,16
|
||||
RR1 F1,B,C,D,E,A,14
|
||||
|
||||
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_32_79_0 W28,W8,W4,W0
|
||||
RR0 F1,E,A,B,C,D,16
|
||||
W_PRECALC_32_79_1 W16,W0
|
||||
RR1 F1,E,A,B,C,D,16
|
||||
W_PRECALC_32_79_2 W0
|
||||
RR0 F1,C,D,E,A,B,18
|
||||
W_PRECALC_32_79_3 W0,18,16
|
||||
RR1 F1,C,D,E,A,B,18
|
||||
|
||||
// starting using F2
|
||||
|
||||
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_0 W24,W4,W0,W28
|
||||
#else
|
||||
W_PRECALC_32_79_0_i386 W24,W4,W0,W28
|
||||
#endif
|
||||
RR0 F2,A,B,C,D,E,20
|
||||
W_PRECALC_32_79_1 W12,W28
|
||||
RR1 F2,A,B,C,D,E,20
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_2 W28
|
||||
#else
|
||||
W_PRECALC_32_79_2_i386 W28
|
||||
#endif
|
||||
RR0 F2,D,E,A,B,C,22
|
||||
W_PRECALC_32_79_3 W28,22,16
|
||||
RR1 F2,D,E,A,B,C,22
|
||||
|
||||
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
#undef K_XMM
|
||||
#define K_XMM 32
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_0 W20,W0,W28,W24
|
||||
#else
|
||||
W_PRECALC_32_79_0_i386 W20,W0,W28,W24
|
||||
#endif
|
||||
RR0 F2,B,C,D,E,A,24
|
||||
W_PRECALC_32_79_1 W8,W24
|
||||
RR1 F2,B,C,D,E,A,24
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_2 W24
|
||||
#else
|
||||
W_PRECALC_32_79_2_i386 W24
|
||||
#endif
|
||||
RR0 F2,E,A,B,C,D,26
|
||||
W_PRECALC_32_79_3 W24,26,K_XMM
|
||||
RR1 F2,E,A,B,C,D,26
|
||||
|
||||
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_32_79_0 W16,W28,W24,W20
|
||||
RR0 F2,C,D,E,A,B,28
|
||||
W_PRECALC_32_79_1 W4,W20
|
||||
RR1 F2,C,D,E,A,B,28
|
||||
W_PRECALC_32_79_2 W20
|
||||
RR0 F2,A,B,C,D,E,30
|
||||
W_PRECALC_32_79_3 W20,30,K_XMM
|
||||
RR1 F2,A,B,C,D,E,30
|
||||
|
||||
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
|
||||
W_PRECALC_32_79_0 W12,W24,W20,W16
|
||||
RR0 F2,D,E,A,B,C,32
|
||||
W_PRECALC_32_79_1 W0,W16
|
||||
RR1 F2,D,E,A,B,C,32
|
||||
W_PRECALC_32_79_2 W16
|
||||
RR0 F2,B,C,D,E,A,34
|
||||
W_PRECALC_32_79_3 W16,34,K_XMM
|
||||
RR1 F2,B,C,D,E,A,34
|
||||
|
||||
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
|
||||
W_PRECALC_32_79_0 W8,W20,W16,W12
|
||||
RR0 F2,E,A,B,C,D,36
|
||||
W_PRECALC_32_79_1 W28,W12
|
||||
RR1 F2,E,A,B,C,D,36
|
||||
W_PRECALC_32_79_2 W12
|
||||
RR0 F2,C,D,E,A,B,38
|
||||
W_PRECALC_32_79_3 W12,38,K_XMM
|
||||
RR1 F2,C,D,E,A,B,38
|
||||
|
||||
// starting using F3
|
||||
|
||||
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
|
||||
W_PRECALC_32_79_0 W4,W16,W12,W8
|
||||
RR0 F3,A,B,C,D,E,40
|
||||
W_PRECALC_32_79_1 W24,W8
|
||||
RR1 F3,A,B,C,D,E,40
|
||||
W_PRECALC_32_79_2 W8
|
||||
RR0 F3,D,E,A,B,C,42
|
||||
W_PRECALC_32_79_3 W8,42,K_XMM
|
||||
RR1 F3,D,E,A,B,C,42
|
||||
|
||||
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
|
||||
#undef K_XMM
|
||||
#define K_XMM 48
|
||||
W_PRECALC_32_79_0 W0,W12,W8,W4
|
||||
RR0 F3,B,C,D,E,A,44
|
||||
W_PRECALC_32_79_1 W20,W4
|
||||
RR1 F3,B,C,D,E,A,44
|
||||
W_PRECALC_32_79_2 W4
|
||||
RR0 F3,E,A,B,C,D,46
|
||||
W_PRECALC_32_79_3 W4,46,K_XMM
|
||||
RR1 F3,E,A,B,C,D,46
|
||||
|
||||
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_32_79_0 W28,W8,W4,W0
|
||||
RR0 F3,C,D,E,A,B,48
|
||||
W_PRECALC_32_79_1 W16,W0
|
||||
RR1 F3,C,D,E,A,B,48
|
||||
W_PRECALC_32_79_2 W0
|
||||
RR0 F3,A,B,C,D,E,50
|
||||
W_PRECALC_32_79_3 W0,50,K_XMM
|
||||
RR1 F3,A,B,C,D,E,50
|
||||
|
||||
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_0 W24,W4,W0,W28
|
||||
#else
|
||||
W_PRECALC_32_79_0_i386 W24,W4,W0,W28
|
||||
#endif
|
||||
RR0 F3,D,E,A,B,C,52
|
||||
W_PRECALC_32_79_1 W12,W28
|
||||
RR1 F3,D,E,A,B,C,52
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_2 W28
|
||||
#else
|
||||
W_PRECALC_32_79_2_i386 W28
|
||||
#endif
|
||||
RR0 F3,B,C,D,E,A,54
|
||||
W_PRECALC_32_79_3 W28,54,K_XMM
|
||||
RR1 F3,B,C,D,E,A,54
|
||||
|
||||
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_0 W20,W0,W28,W24
|
||||
#else
|
||||
W_PRECALC_32_79_0_i386 W20,W0,W28,W24
|
||||
#endif
|
||||
RR0 F3,E,A,B,C,D,56
|
||||
W_PRECALC_32_79_1 W8,W24
|
||||
RR1 F3,E,A,B,C,D,56
|
||||
#if defined (__x86_64__)
|
||||
W_PRECALC_32_79_2 W24
|
||||
#else
|
||||
W_PRECALC_32_79_2_i386 W24
|
||||
#endif
|
||||
RR0 F3,C,D,E,A,B,58
|
||||
W_PRECALC_32_79_3 W24,58,K_XMM
|
||||
RR1 F3,C,D,E,A,B,58
|
||||
|
||||
// starting using F4
|
||||
|
||||
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_32_79_0 W16,W28,W24,W20
|
||||
RR0 F4,A,B,C,D,E,60
|
||||
W_PRECALC_32_79_1 W4,W20
|
||||
RR1 F4,A,B,C,D,E,60
|
||||
W_PRECALC_32_79_2 W20
|
||||
RR0 F4,D,E,A,B,C,62
|
||||
W_PRECALC_32_79_3 W20,62,K_XMM
|
||||
RR1 F4,D,E,A,B,C,62
|
||||
|
||||
.endm
|
||||
|
||||
.macro SOFTWARE_PIPELINING_ssse3
|
||||
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
|
||||
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
|
||||
RR0 F4,B,C,D,E,A,64
|
||||
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
|
||||
RR1 F4,B,C,D,E,A,64
|
||||
W_PRECALC_00_15_2 // W_TMP = W0 + K
|
||||
RR0 F4,E,A,B,C,D,66
|
||||
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
|
||||
RR1 F4,E,A,B,C,D,66
|
||||
|
||||
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
|
||||
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
|
||||
RR0 F4,C,D,E,A,B,68
|
||||
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
|
||||
RR1 F4,C,D,E,A,B,68
|
||||
W_PRECALC_00_15_2 // W_TMP = W28 + K
|
||||
RR0 F4,A,B,C,D,E,70
|
||||
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
|
||||
RR1 F4,A,B,C,D,E,70
|
||||
|
||||
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
|
||||
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
|
||||
RR0 F4,D,E,A,B,C,72
|
||||
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
|
||||
RR1 F4,D,E,A,B,C,72
|
||||
W_PRECALC_00_15_2 // W_TMP = W24 + K
|
||||
RR0 F4,B,C,D,E,A,74
|
||||
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
|
||||
RR1 F4,B,C,D,E,A,74
|
||||
|
||||
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
|
||||
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
|
||||
RR0 F4,E,A,B,C,D,76
|
||||
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
|
||||
RR1 F4,E,A,B,C,D,76
|
||||
W_PRECALC_00_15_2 // W_TMP = W20 + K
|
||||
RR0 F4,C,D,E,A,B,78
|
||||
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
|
||||
RR1 F4,C,D,E,A,B,78
|
||||
.endm
|
||||
|
||||
|
||||
#undef W_PRECALC_00_15_0
|
||||
#undef W_PRECALC_00_15_1
|
||||
#undef W_PRECALC_16_31_0
|
||||
#undef W_PRECALC_32_79_0
|
||||
#undef W_PRECALC_32_79_0_i386
|
||||
|
||||
.macro ENDING // finish up updating hash digests (i=64:79)
|
||||
//i=80
|
||||
RR0 F4,B,C,D,E,A,64
|
||||
RR1 F4,B,C,D,E,A,64
|
||||
RR0 F4,E,A,B,C,D,66
|
||||
RR1 F4,E,A,B,C,D,66
|
||||
|
||||
//i=84
|
||||
RR0 F4,C,D,E,A,B,68
|
||||
RR1 F4,C,D,E,A,B,68
|
||||
RR0 F4,A,B,C,D,E,70
|
||||
RR1 F4,A,B,C,D,E,70
|
||||
|
||||
//i=88
|
||||
RR0 F4,D,E,A,B,C,72
|
||||
RR1 F4,D,E,A,B,C,72
|
||||
RR0 F4,B,C,D,E,A,74
|
||||
RR1 F4,B,C,D,E,A,74
|
||||
|
||||
//i=92
|
||||
RR0 F4,E,A,B,C,D,76
|
||||
RR1 F4,E,A,B,C,D,76
|
||||
RR0 F4,C,D,E,A,B,78
|
||||
RR1 F4,C,D,E,A,B,78
|
||||
.endm
|
||||
|
||||
// load hash digests A,B,C,D,E from memory into registers
|
||||
.macro LOAD_HASH
|
||||
#if defined (__x86_64__)
|
||||
mov (HASH_PTR), A
|
||||
mov 4(HASH_PTR), B
|
||||
mov 8(HASH_PTR), C
|
||||
mov 12(HASH_PTR), D
|
||||
mov 16(HASH_PTR), E
|
||||
#else
|
||||
mov HASH_PTR, T1
|
||||
mov (T1), A
|
||||
mov 4(T1), B
|
||||
mov 8(T1), C
|
||||
mov 12(T1), D
|
||||
mov 16(T1), E
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro UPDATE_HASH arg0, arg1
|
||||
add \arg0, \arg1
|
||||
mov \arg1, \arg0
|
||||
.endm
|
||||
|
||||
.macro UPDATE_ALL_HASH
|
||||
#if defined (__x86_64__)
|
||||
UPDATE_HASH (HASH_PTR), A
|
||||
UPDATE_HASH 4(HASH_PTR), B
|
||||
UPDATE_HASH 8(HASH_PTR), C
|
||||
UPDATE_HASH 12(HASH_PTR), D
|
||||
UPDATE_HASH 16(HASH_PTR), E
|
||||
#else
|
||||
mov HASH_PTR, T1
|
||||
UPDATE_HASH (T1), A
|
||||
UPDATE_HASH 4(T1), B
|
||||
UPDATE_HASH 8(T1), C
|
||||
UPDATE_HASH 12(T1), D
|
||||
UPDATE_HASH 16(T1), E
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
/*
|
||||
main sha1 code for system with ssse3 support
|
||||
*/
|
||||
|
||||
.macro SHA1_PIPELINED_MAIN_BODY_ssse3
|
||||
LOAD_HASH // load initial hashes into A,B,C,D,E
|
||||
INITIAL_W_PRECALC_ssse3 // big_endian_load(W) and W+K (i=0:15)
|
||||
.p2align 4,0x90
|
||||
0:
|
||||
INTERNAL_ssse3 // update W (i=16:79) and update ABCDE (i=0:63)
|
||||
#if Multiple_Blocks
|
||||
#if defined (__x86_64__)
|
||||
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
|
||||
subq _IMM(1), cnt // pre-decrement cnt by 1
|
||||
#else
|
||||
addl _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
|
||||
subl _IMM(1), cnt // pre-decrement cnt by 1
|
||||
#endif
|
||||
jbe 1f // if cnt <= 0, branch to finish off
|
||||
SOFTWARE_PIPELINING_ssse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
|
||||
UPDATE_ALL_HASH // update output hashes
|
||||
jmp 0b // repeat for next block
|
||||
.p2align 4,0x90
|
||||
1:
|
||||
#endif
|
||||
ENDING // update ABCDE (i=64:79)
|
||||
UPDATE_ALL_HASH // update output hashes
|
||||
.endm
|
||||
|
||||
/*
|
||||
I removed the cpu capabilities check. The check is now done
|
||||
in C code and the appropriate version of the assembler code
|
||||
is selected.
|
||||
*/
|
||||
.text
|
||||
.globl _AccelerateCrypto_SHA1_compress_ssse3
|
||||
_AccelerateCrypto_SHA1_compress_ssse3:
|
||||
|
||||
// start the sha1 code with ssse3 support
|
||||
|
||||
// save callee-save registers
|
||||
#if defined (__x86_64__)
|
||||
push %rbp
|
||||
mov %rsp, %rbp
|
||||
push %rbx
|
||||
push %r15
|
||||
#else
|
||||
push %ebx
|
||||
push %ebp
|
||||
push %esi
|
||||
push %edi
|
||||
#endif
|
||||
|
||||
sub $stack_size, sp // allocate stack memory for use
|
||||
|
||||
// save used xmm register if this is for kernel
|
||||
#if BUILDKERNEL
|
||||
xmov %xmm0, 4*16(sp)
|
||||
xmov %xmm1, 5*16(sp)
|
||||
xmov %xmm2, 6*16(sp)
|
||||
xmov %xmm3, 7*16(sp)
|
||||
xmov %xmm4, 8*16(sp)
|
||||
xmov %xmm5, 9*16(sp)
|
||||
xmov %xmm6, 10*16(sp)
|
||||
xmov %xmm7, 11*16(sp)
|
||||
#if defined (__x86_64__)
|
||||
xmov %xmm8, 12*16(sp)
|
||||
xmov %xmm9, 13*16(sp)
|
||||
xmov %xmm10, 14*16(sp)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined (__x86_64__)
|
||||
|
||||
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
|
||||
mov ctx, HASH_PTR
|
||||
mov buf, BUFFER_PTR
|
||||
#if Multiple_Blocks
|
||||
mov %rsi, cnt
|
||||
#endif
|
||||
lea K_XMM_AR(%rip), K_BASE
|
||||
xmov 0x40(K_BASE), XMM_SHUFB_BSWAP
|
||||
|
||||
#else // __i386__
|
||||
|
||||
#if BUILDKERNEL
|
||||
lea K_XMM_AR, %eax
|
||||
#else
|
||||
// Get address of 0 in R.
|
||||
call 0f // Push program counter onto stack.
|
||||
0: pop %eax // Get program counter.
|
||||
lea K_XMM_AR-0b(%eax), %eax
|
||||
#endif
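// (call 0f / pop %eax is the classic i386 position-independent-code idiom:
// the call pushes the address of label 0, pop retrieves it, and the lea
// then forms the run-time address of K_XMM_AR relative to it.)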
|
||||
mov %eax, K_BASE
|
||||
xmov 0x40(%eax), %xmm0
|
||||
xmov %xmm0, XMM_SHUFB_BSWAP
|
||||
|
||||
#endif
|
||||
|
||||
SHA1_PIPELINED_MAIN_BODY_ssse3
|
||||
|
||||
// restore used xmm registers if this is for kernel
|
||||
#if BUILDKERNEL
|
||||
xmov 4*16(sp), %xmm0
|
||||
xmov 5*16(sp), %xmm1
|
||||
xmov 6*16(sp), %xmm2
|
||||
xmov 7*16(sp), %xmm3
|
||||
xmov 8*16(sp), %xmm4
|
||||
xmov 9*16(sp), %xmm5
|
||||
xmov 10*16(sp), %xmm6
|
||||
xmov 11*16(sp), %xmm7
|
||||
#if defined (__x86_64__)
|
||||
xmov 12*16(sp), %xmm8
|
||||
xmov 13*16(sp), %xmm9
|
||||
xmov 14*16(sp), %xmm10
|
||||
#endif
|
||||
#endif
|
||||
|
||||
add $stack_size, sp // deallocate stack memory
|
||||
|
||||
// restore callee-save registers
|
||||
#if defined (__x86_64__)
|
||||
pop %r15
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
#else
|
||||
pop %edi
|
||||
pop %esi
|
||||
pop %ebp
|
||||
pop %ebx
|
||||
#endif
|
||||
|
||||
ret // return
|
||||
|
||||
CC_ASM_SECTION_CONST
|
||||
.p2align 4, 0x90
|
||||
|
||||
#define K1 0x5a827999
|
||||
#define K2 0x6ed9eba1
|
||||
#define K3 0x8f1bbcdc
|
||||
#define K4 0xca62c1d6
|
||||
|
||||
K_XMM_AR:
|
||||
.long K1
|
||||
.long K1
|
||||
.long K1
|
||||
.long K1
|
||||
.long K2
|
||||
.long K2
|
||||
.long K2
|
||||
.long K2
|
||||
.long K3
|
||||
.long K3
|
||||
.long K3
|
||||
.long K3
|
||||
.long K4
|
||||
.long K4
|
||||
.long K4
|
||||
.long K4
|
||||
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
|
||||
.long 0x00010203
|
||||
.long 0x04050607
|
||||
.long 0x08090a0b
|
||||
.long 0x0c0d0e0f
|
||||
|
||||
|
||||
#endif // architecture x86_64 or i386
|
||||
|
||||
#endif // (defined(__x86_64__) || defined(__i386__))
|
||||
|
||||
|
|
@ -0,0 +1,854 @@
|
|||
# Copyright (c) (2011,2012,2013,2015,2016,2018,2019) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
/*
|
||||
This file provides armv7+neon hand implementation of the following function
|
||||
|
||||
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
|
||||
|
||||
which is a C function in sha2.c (from xnu).
|
||||
|
||||
sha256 algorithm per block description:
|
||||
|
||||
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
|
||||
2. load 8 digests a-h from ctx->state
|
||||
3. for r = 0:15
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
4. for r = 16:63
|
||||
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
|
||||
In the assembly implementation:
|
||||
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
|
||||
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
|
||||
- the 8 digests (a-h) will be stored in GPR or memory
|
||||
|
||||
the implementation per block looks like
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
load W(0:15) (big-endian per 4 bytes) into q0:q3
|
||||
pre_calculate and store W+K(0:15) in stack
|
||||
|
||||
load digests a-h from ctx->state;
|
||||
|
||||
for (r=0;r<48;r+=4) {
|
||||
digests a-h update and permute round r:r+3
|
||||
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
|
||||
}
|
||||
|
||||
for (r=48;r<64;r+=4) {
|
||||
digests a-h update and permute round r:r+3
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
|
||||
into the last 16 rounds of its previous block:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
load W(0:15) (big-endian per 4 bytes) into q0:q3
|
||||
pre_calculate and store W+K(0:15) in stack
|
||||
|
||||
L_loop:
|
||||
|
||||
load digests a-h from ctx->state;
|
||||
|
||||
for (r=0;r<48;r+=4) {
|
||||
digests a-h update and permute round r:r+3
|
||||
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
|
||||
}
|
||||
|
||||
num_block--;
|
||||
if (num_block==0) jmp L_last_block;
|
||||
|
||||
for (r=48;r<64;r+=4) {
|
||||
digests a-h update and permute round r:r+3
|
||||
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
|
||||
pre_calculate and store W+K([r:r+3]%16) in stack
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
jmp L_loop;
|
||||
|
||||
L_last_block:
|
||||
|
||||
for (r=48;r<64;r+=4) {
|
||||
digests a-h update and permute round r:r+3
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
Apple CoreOS vector & numerics
|
||||
*/
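/*
   Hedged C sketch of one round of steps 3/4 above (illustrative only; wk is
   the precomputed K[r]+W[r] word, and the array rotation plays the role of
   the a-h register permutation):

   #include <stdint.h>
   static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

   static void sha256_round(uint32_t s[8], uint32_t wk)   // s = {a,b,c,d,e,f,g,h}
   {
       uint32_t a=s[0], b=s[1], c=s[2], e=s[4], f=s[5], g=s[6];
       uint32_t T1 = s[7] + (ror32(e,6)^ror32(e,11)^ror32(e,25)) + ((e&f)^(~e&g)) + wk;
       uint32_t T2 = (ror32(a,2)^ror32(a,13)^ror32(a,22)) + ((a&b)^(a&c)^(b&c));
       s[7]=g; s[6]=f; s[5]=e; s[4]=s[3]+T1; s[3]=c; s[2]=b; s[1]=a; s[0]=T1+T2;
   }
*/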
|
||||
|
||||
#if (defined(__arm__) && defined(__ARM_NEON__))
|
||||
|
||||
// associate variables with registers or memory
|
||||
|
||||
#define ctx r0
|
||||
#define data r1
|
||||
#define num_blocks [sp, #64]
|
||||
#define _i_loop [sp, #68]
|
||||
|
||||
#define a r2
|
||||
#define b r3
|
||||
#define c r4
|
||||
#define d r5
|
||||
#define e r8
|
||||
#define f r9
|
||||
#define g r10
|
||||
#define h r11
|
||||
|
||||
#define K r6
|
||||
|
||||
// 2 local variables
|
||||
#define t r12
|
||||
#define s lr
|
||||
|
||||
// a window (16 words) of message schedule
|
||||
#define W0 q0
|
||||
#define W1 q1
|
||||
#define W2 q2
|
||||
#define W3 q3
|
||||
#define zero q8
|
||||
|
||||
// circular buffer for WK[(r:r+15)%16]
|
||||
#define WK(r) [sp,#((r)&15)*4]
|
||||
|
||||
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
|
||||
|
||||
.macro Ch
|
||||
mvn t, $0 // ~x
|
||||
and s, $0, $1 // (x) & (y)
|
||||
and t, t, $2 // (~(x)) & (z)
|
||||
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
|
||||
.endm
|
||||
|
||||
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
|
||||
.macro Maj
|
||||
eor t, $1, $2 // y^z
|
||||
and s, $1, $2 // y&z
|
||||
and t, t, $0 // x&(y^z)
|
||||
eor t, t, s // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
.endm
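// note: (x & (y ^ z)) ^ (y & z) is an equivalent 4-instruction form of
// ((x & y) ^ (x & z) ^ (y & z)); it saves one and.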
|
||||
|
||||
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
|
||||
|
||||
// performs sigma0_256 on 4 words on a Q register
|
||||
// use q6/q7 as intermediate registers
|
||||
.macro sigma0
|
||||
vshr.u32 q6, $0, #7
|
||||
vshl.i32 q7, $0, #14
|
||||
vshr.u32 $0, $0, #3
|
||||
veor $0, q6
|
||||
veor $0, q7
|
||||
vshr.u32 q6, #11
|
||||
vshl.i32 q7, #11
|
||||
veor $0, q6
|
||||
veor $0, q7
|
||||
.endm
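// note: the two shift pairs above realize the rotates by 7 and 18 as
// (x>>7)|(x<<25) and (x>>18)|(x<<14); since each rotate's two halves occupy
// disjoint bit positions, the ORs fold into the veor (XOR) chain.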
|
||||
|
||||
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
|
||||
|
||||
// performs sigma1_256 on 4 words on a Q register
|
||||
// use q6/q7 as intermediate registers
|
||||
.macro sigma1
|
||||
vshr.u32 q6, $0, #17
|
||||
vshl.i32 q7, $0, #13
|
||||
vshr.u32 $0, $0, #10
|
||||
veor $0, q6
|
||||
veor $0, q7
|
||||
vshr.u32 q6, #2
|
||||
vshl.i32 q7, #2
|
||||
veor $0, q6
|
||||
veor $0, q7
|
||||
.endm
|
||||
|
||||
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
|
||||
|
||||
.macro Sigma0
|
||||
ror t, $0, #2 // S32(2, (x))
|
||||
ror s, $0, #13 // S32(13, (x))
|
||||
eor t, t, s // S32(2, (x)) ^ S32(13, (x))
|
||||
ror s, s, #9 // S32(22, (x))
|
||||
eor t, t, s // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
|
||||
.endm
|
||||
|
||||
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
|
||||
|
||||
.macro Sigma1
|
||||
ror t, $0, #6 // S32(6, (x))
|
||||
ror s, $0, #11 // S32(11, (x))
|
||||
eor t, t, s // S32(6, (x)) ^ S32(11, (x))
|
||||
ror s, s, #14 // S32(25, (x))
|
||||
eor t, t, s // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
|
||||
.endm
|
||||
|
||||
// per round digests update
|
||||
.macro round
|
||||
// ror t, $4, #6 // S32(6, (x))
|
||||
eor t, t, $4, ror #11 // S32(6, (x)) ^ S32(11, (x))
|
||||
and s, $4, $5 // (x) & (y)
|
||||
eor t, t, $4, ror #25 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
|
||||
add $7, t // use h to store h+Sigma1(e)
|
||||
bic t, $6, $4 // (~(x)) & (z)
|
||||
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
|
||||
ldr s, WK($8) //
|
||||
add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
|
||||
ror t, $0, #2 // S32(2, (x))
|
||||
add $7, s // h = T1
|
||||
eor t, t, $0, ror #13 // S32(2, (x)) ^ S32(13, (x))
|
||||
add $3, $7 // d += T1;
|
||||
eor t, t, $0, ror #22 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
|
||||
add $7, t // h = T1 + Sigma0(a);
|
||||
eor t, $1, $2 // y^z
|
||||
and s, $1, $2 // y&z
|
||||
and t, t, $0 // x&(y^z)
|
||||
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
// add $7, s // h = T1 + Sigma0(a) + Maj(a,b,c);
|
||||
.endm
|
||||
|
||||
// per 4 rounds digests update and permutation
|
||||
// permutation is absorbed by rotating the roles of digests a-h
|
||||
.macro rounds
|
||||
ror t, $4, #6
|
||||
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
|
||||
ror t, $3, #6
|
||||
add $7, s
|
||||
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
|
||||
ror t, $2, #6
|
||||
add $6, s
|
||||
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
|
||||
ror t, $1, #6
|
||||
add $5, s
|
||||
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
|
||||
add $4, s
|
||||
.endm
|
||||
|
||||
.macro rounds_a
|
||||
ror t, e, #6
|
||||
round a, b, c, d, e, f, g, h, 0+$0
|
||||
ror t, d, #6
|
||||
add h, s
|
||||
round h, a, b, c, d, e, f, g, 1+$0
|
||||
ror t, c, #6
|
||||
add g, s
|
||||
round g, h, a, b, c, d, e, f, 2+$0
|
||||
ror t, b, #6
|
||||
add f, s
|
||||
round f, g, h, a, b, c, d, e, 3+$0
|
||||
add e, s
|
||||
.endm
|
||||
|
||||
.macro rounds_a_update_W_WK
|
||||
ror t, e, #6
|
||||
round a, b, c, d, e, f, g, h, 0+$0
|
||||
vld1.s32 {$2},[data]!
|
||||
ror t, d, #6
|
||||
add h, s
|
||||
round h, a, b, c, d, e, f, g, 1+$0
|
||||
vrev32.8 $2, $2
|
||||
ror t, c, #6
|
||||
vld1.s32 {q4},[K,:128]!
|
||||
add g, s
|
||||
round g, h, a, b, c, d, e, f, 2+$0
|
||||
ror t, b, #6
|
||||
add f, s
|
||||
vadd.s32 q4, $2
|
||||
round f, g, h, a, b, c, d, e, 3+$0
|
||||
add t, sp, #($1*16)
|
||||
add e, s
|
||||
vst1.32 {q4},[t]
|
||||
.endm
|
||||
|
||||
.macro rounds_e
|
||||
ror t, a, #6
|
||||
round e, f, g, h, a, b, c, d, 0+$0
|
||||
ror t, h, #6
|
||||
add d, s
|
||||
round d, e, f, g, h, a, b, c, 1+$0
|
||||
ror t, g, #6
|
||||
add c, s
|
||||
round c, d, e, f, g, h, a, b, 2+$0
|
||||
ror t, f, #6
|
||||
add b, s
|
||||
round b, c, d, e, f, g, h, a, 3+$0
|
||||
add a, s
|
||||
.endm
|
||||
|
||||
.macro rounds_e_update_W_WK
|
||||
ror t, a, #6
|
||||
round e, f, g, h, a, b, c, d, 0+$0
|
||||
vld1.s32 {$2},[data]!
|
||||
ror t, h, #6
|
||||
add d, s
|
||||
round d, e, f, g, h, a, b, c, 1+$0
|
||||
vrev32.8 $2, $2
|
||||
ror t, g, #6
|
||||
vld1.s32 {q4},[K,:128]!
|
||||
add c, s
|
||||
round c, d, e, f, g, h, a, b, 2+$0
|
||||
ror t, f, #6
|
||||
add b, s
|
||||
vadd.s32 q4, $2
|
||||
round b, c, d, e, f, g, h, a, 3+$0
|
||||
add t, sp, #($1*16)
|
||||
add a, s
|
||||
vst1.32 {q4},[t]
|
||||
.endm
|
||||
|
||||
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
|
||||
.macro message_schedule
|
||||
vld1.32 {q5},[K,:128]!
|
||||
vext.32 q4, $0, $1, #1 // Q4 = w4:w1
|
||||
sigma0 q4 // sigma0(w4:w1)
|
||||
vadd.s32 $0, q4 // w3:w0 + sigma0(w4:w1)
|
||||
vext.32 q6, $2, $3, #1 // Q6 = w12:w9
|
||||
vadd.s32 $0, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
|
||||
vext.64 q4, $3, zero, #1 // 0 0 w15:w14
|
||||
sigma1 q4 // Q4 = sigma1(0 0 w15:w14)
|
||||
vadd.s32 $0, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
|
||||
vext.64 q4, zero, $0, #1 // Q4 = (w17:w16 0 0)
|
||||
sigma1 q4 // sigma1(w17:w16 0 0)
|
||||
vadd.s32 $0, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
|
||||
add t, sp, #(($4&15)*4)
|
||||
vadd.s32 q5, $0 // W+K
|
||||
vst1.32 {q5},[t,:128]
|
||||
.endm
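
/*
 Scalar C equivalent of the 4-wide step above (a sketch under the FIPS
 180-4 definitions; w[] holds W[r-16..r-1] and is updated in place):

     static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
     static uint32_t sig0(uint32_t x) { return ror32(x,7)  ^ ror32(x,18) ^ (x >> 3);  }
     static uint32_t sig1(uint32_t x) { return ror32(x,17) ^ ror32(x,19) ^ (x >> 10); }

     for (int j = 0; j < 4; j++)   // produce W[r..r+3]
         w[j] += sig0(w[(j+1) & 15]) + w[(j+9) & 15] + sig1(w[(j+14) & 15]);

 sigma1 is applied in two passes ("0 0 w15:w14", then "w17:w16 0 0")
 because W[r+2] and W[r+3] need the freshly computed W[r] and W[r+1].
*/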

// this macro is used in the last 16 rounds of the current block:
// it reads the next message block (16 4-byte words, 4 per invocation), loads them
// into the window W[r:r+3], computes WK[r:r+3],
// and saves them to the stack to prepare for the next block

.macro update_W_WK
    vld1.s32    {$1},[data]!
    vrev32.8    $1, $1
    add         t, sp, #($0*16)
    vld1.s32    {q4},[K,:128]!
    vadd.s32    q4, $1
    vst1.32     {q4},[t]
.endm

.macro Update_Digits
    ldrd    t, s, [ctx]
    add     a, t
    add     b, s
    strd    a, b, [ctx]

    ldrd    t, s, [ctx,#8]
    add     c, t
    add     d, s
    strd    c, d, [ctx, #8]

    ldrd    t, s, [ctx,#16]
    add     e, t
    add     f, s
    strd    e, f, [ctx, #16]

    ldrd    t, s, [ctx,#24]
    add     g, t
    add     h, s
    strd    g, h, [ctx, #24]
.endm

.macro rounds_a_schedule_update
    eor     t, e, e, ror #5    // S32(6, (x)) ^ S32(11, (x))
    vld1.32 {q5},[K,:128]!
    eor     t, t, e, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    vext.32 q4, $1, $2, #1     // Q4 = w4:w1
    and     s, e, f            // (x) & (y)
    add     h, t, ror #6       // use h to store h+Sigma1(e)
    bic     t, g, e            // (~(x)) & (z)
    vshr.u32    q6, q4, #7
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    vshl.i32    q7, q4, #14
    ldr     s, WK($0)          //
    add     h, t               // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, a, a, ror #11   // S32(2, (x)) ^ S32(13, (x))
    vshr.u32    q4, q4, #3
    add     h, s               // h = T1
    eor     t, t, a, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     d, h               // d += T1;
    add     h, t, ror #2       // h = T1 + Sigma0(a);
    veor    q4, q6
    eor     t, b, c            // y^z
    vshr.u32    q6, #11
    and     s, b, c            // y&z
    and     t, t, a            // x&(y^z)
    veor    q4, q7
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    eor     t, d, d, ror #5    // S32(6, (x)) ^ S32(11, (x))
    vshl.i32    q7, #11

    add     h, s
    veor    q4, q6
    eor     t, t, d, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    and     s, d, e            // (x) & (y)
    veor    q4, q7
    add     g, t, ror #6       // use h to store h+Sigma1(e)

    bic     t, f, d            // (~(x)) & (z)
    vext.32 q6, $3, $4, #1     // Q6 = w12:w9
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    ldr     s, WK(1+$0)        //
    vadd.s32    $1, q4         // w3:w0 + sigma0(w4:w1)
    add     g, t               // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, h, h, ror #11   // S32(2, (x)) ^ S32(13, (x))
    vadd.s32    $1, q6         // w3:w0 + sigma0(w4:w1) + w12:w9
    add     g, s               // h = T1
    eor     t, t, h, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    vext.64 q4, $4, zero, #1   // 0 0 w15:w14
    add     c, g               // d += T1;
    add     g, t, ror #2       // h = T1 + Sigma0(a);
    eor     t, a, b            // y^z
    and     s, a, b            // y&z
    vshr.u32    q6, q4, #17
    and     t, t, h            // x&(y^z)
    vshl.i32    q7, q4, #13
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    vshr.u32    q4, q4, #10

    eor     t, c, c, ror #5    // S32(6, (x)) ^ S32(11, (x))
    veor    q4, q6
    add     g, s
    veor    q4, q7
    eor     t, t, c, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    vshr.u32    q6, #2
    and     s, c, d            // (x) & (y)
    vshl.i32    q7, #2
    add     f, t, ror #6       // use h to store h+Sigma1(e)
    veor    q4, q6
    bic     t, e, c            // (~(x)) & (z)
    veor    q4, q7
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    vadd.s32    $1, q4         // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
    ldr     s, WK(2+$0)        //
    vext.64 q4, zero, $1, #1   // Q4 = (w17:w16 0 0)
    add     f, t               // t = h+Sigma1(e)+Ch(e,f,g);
    vshr.u32    q6, q4, #17
    eor     t, g, g, ror #11   // S32(2, (x)) ^ S32(13, (x))
    vshl.i32    q7, q4, #13
    add     f, s               // h = T1
    vshr.u32    q4, q4, #10
    eor     t, t, g, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    veor    q4, q6
    add     b, f               // d += T1;
    veor    q4, q7
    add     f, t, ror #2       // h = T1 + Sigma0(a);
    eor     t, h, a            // y^z
    vshr.u32    q6, #2
    and     s, h, a            // y&z
    and     t, t, g            // x&(y^z)
    vshl.i32    q7, #2
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

    eor     t, b, b, ror #5    // S32(6, (x)) ^ S32(11, (x))
    veor    q4, q6
    add     f, s
    eor     t, t, b, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    veor    q4, q7

    vadd.s32    $1, q4         // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)

    and     s, b, c            // (x) & (y)
    add     e, t, ror #6       // use h to store h+Sigma1(e)
    bic     t, d, b            // (~(x)) & (z)
    vadd.s32    q5, $1         // W+K
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    ldr     s, WK(3+$0)        //
    add     e, t               // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, f, f, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add     e, s               // h = T1
    eor     t, t, f, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     a, e               // d += T1;
    add     e, t, ror #2       // h = T1 + Sigma0(a);
    eor     t, g, h            // y^z
    and     s, g, h            // y&z
    and     t, t, f            // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

    add     t, sp, #(($0&15)*4)
    add     e, s
    vst1.32 {q5},[t,:128]

.endm

.macro rounds_e_schedule_update
    eor     t, a, a, ror #5    // S32(6, (x)) ^ S32(11, (x))
    vld1.32 {q5},[K,:128]!
    eor     t, t, a, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    vext.32 q4, $1, $2, #1     // Q4 = w4:w1
    and     s, a, b            // (x) & (y)
    add     d, t, ror #6       // use h to store h+Sigma1(e)
    bic     t, c, a            // (~(x)) & (z)
    vshr.u32    q6, q4, #7
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    vshl.i32    q7, q4, #14
    ldr     s, WK($0)          //
    add     d, t               // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, e, e, ror #11   // S32(2, (x)) ^ S32(13, (x))
    vshr.u32    q4, q4, #3
    add     d, s               // h = T1
    eor     t, t, e, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     h, d               // d += T1;
    veor    q4, q6
    add     d, t, ror #2       // h = T1 + Sigma0(a);
    vshr.u32    q6, #11
    eor     t, f, g            // y^z
    and     s, f, g            // y&z
    veor    q4, q7
    and     t, t, e            // x&(y^z)
    vshl.i32    q7, #11
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    veor    q4, q6
    eor     t, h, h, ror #5    // S32(6, (x)) ^ S32(11, (x))

    vext.32 q6, $3, $4, #1     // Q6 = w12:w9

    add     d, s
    veor    q4, q7
    eor     t, t, h, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    and     s, h, a            // (x) & (y)
    vadd.s32    $1, q4         // w3:w0 + sigma0(w4:w1)
    add     c, t, ror #6       // use h to store h+Sigma1(e)
    bic     t, b, h            // (~(x)) & (z)

    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    ldr     s, WK(1+$0)        //
    add     c, t               // t = h+Sigma1(e)+Ch(e,f,g);
    vadd.s32    $1, q6         // w3:w0 + sigma0(w4:w1) + w12:w9
    eor     t, d, d, ror #11   // S32(2, (x)) ^ S32(13, (x))
    vext.64 q4, $4, zero, #1   // 0 0 w15:w14
    add     c, s               // h = T1
    eor     t, t, d, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     g, c               // d += T1;
    vshr.u32    q6, q4, #17
    add     c, t, ror #2       // h = T1 + Sigma0(a);
    vshl.i32    q7, q4, #13
    eor     t, e, f            // y^z
    vshr.u32    q4, q4, #10
    and     s, e, f            // y&z
    and     t, t, d            // x&(y^z)
    veor    q4, q6
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

    veor    q4, q7

    eor     t, g, g, ror #5    // S32(6, (x)) ^ S32(11, (x))
    vshr.u32    q6, #2
    add     c, s
    vshl.i32    q7, #2
    eor     t, t, g, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    veor    q4, q6
    and     s, g, h            // (x) & (y)
    veor    q4, q7
    add     b, t, ror #6       // use h to store h+Sigma1(e)
    vadd.s32    $1, q4         // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
    bic     t, a, g            // (~(x)) & (z)
    vext.64 q4, zero, $1, #1   // Q4 = (w17:w16 0 0)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    ldr     s, WK(2+$0)        //
    add     b, t               // t = h+Sigma1(e)+Ch(e,f,g);
    vshr.u32    q6, q4, #17
    eor     t, c, c, ror #11   // S32(2, (x)) ^ S32(13, (x))
    vshl.i32    q7, q4, #13
    add     b, s               // h = T1
    vshr.u32    q4, q4, #10
    eor     t, t, c, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     f, b               // d += T1;
    veor    q4, q6
    add     b, t, ror #2       // h = T1 + Sigma0(a);
    vshr.u32    q6, #2
    eor     t, d, e            // y^z
    veor    q4, q7
    and     s, d, e            // y&z
    vshl.i32    q7, #2
    and     t, t, c            // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

    veor    q4, q6
    eor     t, f, f, ror #5    // S32(6, (x)) ^ S32(11, (x))
    veor    q4, q7
    add     b, s
    eor     t, t, f, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))

    and     s, f, g            // (x) & (y)
    add     a, t, ror #6       // use h to store h+Sigma1(e)
    bic     t, h, f            // (~(x)) & (z)
    vadd.s32    $1, q4         // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    ldr     s, WK(3+$0)        //
    add     a, t               // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, b, b, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add     a, s               // h = T1
    eor     t, t, b, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    vadd.s32    q5, $1         // W+K
    add     e, a               // d += T1;
    add     a, t, ror #2       // h = T1 + Sigma0(a);
    eor     t, c, d            // y^z
    and     s, c, d            // y&z
    and     t, t, b            // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    add     t, sp, #(($0&15)*4)
    add     a, s

    vst1.32 {q5},[t,:128]
.endm

.subsections_via_symbols
    .text

    .p2align 4

K256:
    .long   0x428a2f98
    .long   0x71374491
    .long   0xb5c0fbcf
    .long   0xe9b5dba5
    .long   0x3956c25b
    .long   0x59f111f1
    .long   0x923f82a4
    .long   0xab1c5ed5
    .long   0xd807aa98
    .long   0x12835b01
    .long   0x243185be
    .long   0x550c7dc3
    .long   0x72be5d74
    .long   0x80deb1fe
    .long   0x9bdc06a7
    .long   0xc19bf174
    .long   0xe49b69c1
    .long   0xefbe4786
    .long   0x0fc19dc6
    .long   0x240ca1cc
    .long   0x2de92c6f
    .long   0x4a7484aa
    .long   0x5cb0a9dc
    .long   0x76f988da
    .long   0x983e5152
    .long   0xa831c66d
    .long   0xb00327c8
    .long   0xbf597fc7
    .long   0xc6e00bf3
    .long   0xd5a79147
    .long   0x06ca6351
    .long   0x14292967
    .long   0x27b70a85
    .long   0x2e1b2138
    .long   0x4d2c6dfc
    .long   0x53380d13
    .long   0x650a7354
    .long   0x766a0abb
    .long   0x81c2c92e
    .long   0x92722c85
    .long   0xa2bfe8a1
    .long   0xa81a664b
    .long   0xc24b8b70
    .long   0xc76c51a3
    .long   0xd192e819
    .long   0xd6990624
    .long   0xf40e3585
    .long   0x106aa070
    .long   0x19a4c116
    .long   0x1e376c08
    .long   0x2748774c
    .long   0x34b0bcb5
    .long   0x391c0cb3
    .long   0x4ed8aa4a
    .long   0x5b9cca4f
    .long   0x682e6ff3
    .long   0x748f82ee
    .long   0x78a5636f
    .long   0x84c87814
    .long   0x8cc70208
    .long   0x90befffa
    .long   0xa4506ceb
    .long   0xbef9a3f7
    .long   0xc67178f2

    .syntax unified
    .p2align 2
    .code   16
    .thumb_func _AccelerateCrypto_SHA256_compress

    .globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:

    // the 2nd and 3rd calling arguments are swapped relative to the original code,
    // so switch r1/r2 before proceeding
    mov     r12, r1
    mov     r1, r2
    mov     r2, r12

    // push callee-saved registers
    push    {r4-r7,lr}
    add     r7, sp, #12         // set up dtrace frame pointer
    push    {r8-r11}

    // align sp to 16-byte boundary
    mov     r12, sp
    ands    r12, r12, #15       // bytes to align to 16-byte boundary
    it      eq
    addeq   r12, #16            // if already aligned, still insert 16 bytes (so the adjustment is nonzero and can be saved)
    sub     sp, r12
    str     r12, [sp]

#if BUILDKERNEL
    vpush   {q8}
#endif
    vpush   {q0-q7}
#define stack_size  (16*5)      // circular buffer W0-W3, extra 16 bytes to save num_blocks and _i_loop
    sub     sp, #stack_size

    str     r2, num_blocks

    veor    zero, zero

    // set up pointer to table K256[]
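    // (L_table1 stores the offset of L_Tab$non_lazy_ptr from L_table0+4; in
    // Thumb, reading pc at L_table0 yields exactly L_table0+4, so the second
    // ldr fetches the non-lazy pointer, which holds the address of K256.)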
    ldr     K, L_table1
L_table0:
    mov     r12, pc
    ldr     K, [r12, K]
    bal     0f
L_table1:
    .long   L_Tab$non_lazy_ptr-(L_table0+4)
0:

    // load W[0:15]
    vld1.s32    {W0-W1},[data]!
    vld1.s32    {W2-W3},[data]!

    // load K[0:15] & per word byte swap
    vrev32.8    W0, W0
    vrev32.8    W1, W1
    vld1.s32    {q4-q5}, [K,:128]!
    vrev32.8    W2, W2
    vrev32.8    W3, W3
    vld1.s32    {q6-q7}, [K,:128]!

    // compute WK[0:15] and save in stack

    vadd.s32    q4, q0
    vadd.s32    q5, q1
    vadd.s32    q6, q2
    vadd.s32    q7, q3

    vstmia      sp,{q4-q7}

    // digests a-h = ctx->states;
    ldmia       ctx,{a-d,e-h}

L_loop:

    // rounds 0:47 interleaved with W/WK update for rounds 16:63
    mov     t, #3
    str     t, _i_loop
L_i_loop:
    rounds_a_schedule_update    0,W0,W1,W2,W3
    rounds_e_schedule_update    4,W1,W2,W3,W0
    rounds_a_schedule_update    8,W2,W3,W0,W1
    rounds_e_schedule_update    12,W3,W0,W1,W2
    ldr     t, _i_loop
    subs    t, t, #1
    str     t, _i_loop
    bgt     L_i_loop

    // revert K to the beginning of K256[]
    ldr     t, num_blocks
    sub     K, #256

    subs    t, #1               // num_blocks--
    beq     L_final_block       // if final block, wrap up final rounds
    str     t, num_blocks

    // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
#if 0
    rounds_a    48
    update_W_WK 0, W0
    rounds_e    52
    update_W_WK 1, W1
    rounds_a    56
    update_W_WK 2, W2
    rounds_e    60
    update_W_WK 3, W3
#else
    rounds_a_update_W_WK    48, 0, W0
    rounds_e_update_W_WK    52, 1, W1
    rounds_a_update_W_WK    56, 2, W2
    rounds_e_update_W_WK    60, 3, W3
#endif

    // ctx->states += digests a-h
    Update_Digits

    // digests a-h = ctx->states;
    // ldmia    ctx,{a-d,e-h}

    bal     L_loop              // branch for next block

    // wrap up digest update round 48:63 for final block
L_final_block:
    rounds_a    48
    rounds_e    52
    rounds_a    56
    rounds_e    60

    // ctx->states += digests a-h
    Update_Digits

    // free allocated stack memory
    add     sp, #stack_size

    // restore q0-q7 (and q8 if kernel)
    vpop    {q0-q1}
    vpop    {q2-q3}
    vpop    {q4-q5}
    vpop    {q6-q7}
#if BUILDKERNEL
    vpop    {q8}
#endif

    // dealign sp from the 16-byte boundary
    ldr     r12, [sp]
    add     sp, r12

    // restore callee-save registers and return
    pop     {r8-r11}
    pop     {r4-r7,pc}

    .section    __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
    .p2align    2
L_Tab$non_lazy_ptr:
    .indirect_symbol    K256
    .long   0

#endif // (defined(__arm__) && defined(__ARM_NEON__))

@ -0,0 +1,389 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

/*
    This file provides an arm64 (SHA-2 crypto extension) hand implementation of the following function

    void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

    which is a C function in sha2.c (from xnu).

    sha256 algorithm per block description:

        1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
        2. load 8 digests a-h from ctx->state
        3. for r = 0:15
            T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
            d += T1;
            h = T1 + Sigma0(a) + Maj(a,b,c)
            permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
        4. for r = 16:63
            W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
            T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
            d += T1;
            h = T1 + Sigma0(a) + Maj(a,b,c)
            permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

    In the assembly implementation:
        - a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
        - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
        - the 8 digests (a-h) will be stored in GPR or memory

    the implementation per block looks like

    ----------------------------------------------------------------------------

    load W(0:15) (big-endian per 4 bytes) into q0:q3
    pre_calculate and store W+K(0:15) in stack

    load digests a-h from ctx->state;

    for (r=0;r<48;r+=4) {
        digests a-h update and permute round r:r+3
        update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
    }

    for (r=48;r<64;r+=4) {
        digests a-h update and permute round r:r+3
    }

    ctx->states += digests a-h;

    ----------------------------------------------------------------------------

    our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
    into the last 16 rounds of its previous block:

    ----------------------------------------------------------------------------

    load W(0:15) (big-endian per 4 bytes) into q0:q3
    pre_calculate and store W+K(0:15) in stack

L_loop:

    load digests a-h from ctx->state;

    for (r=0;r<48;r+=4) {
        digests a-h update and permute round r:r+3
        update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
    }

    num_block--;
    if (num_block==0) jmp L_last_block;

    for (r=48;r<64;r+=4) {
        digests a-h update and permute round r:r+3
        load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
        pre_calculate and store W+K([r:r+3]%16) in stack
    }

    ctx->states += digests a-h;

    jmp L_loop;

L_last_block:

    for (r=48;r<64;r+=4) {
        digests a-h update and permute round r:r+3
    }

    ctx->states += digests a-h;

    ------------------------------------------------------------------------

    Apple CoreOS vector & numerics
*/
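
/*
 For reference, a compact (non-vectorized) C version of the per-block
 transform sketched above -- a minimal illustration under the FIPS 180-4
 definitions, not the shipped implementation (K is the 64-entry constant
 table; names here are ours):

     static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

     static void sha256_block(uint32_t st[8], const uint8_t p[64], const uint32_t K[64])
     {
         uint32_t w[64], v[8];
         for (int i = 0; i < 16; i++)   // big-endian word load
             w[i] = ((uint32_t)p[4*i] << 24) | ((uint32_t)p[4*i+1] << 16) |
                    ((uint32_t)p[4*i+2] << 8) | p[4*i+3];
         for (int i = 16; i < 64; i++) {
             uint32_t s0 = ror32(w[i-15],7) ^ ror32(w[i-15],18) ^ (w[i-15] >> 3);
             uint32_t s1 = ror32(w[i-2],17) ^ ror32(w[i-2],19) ^ (w[i-2] >> 10);
             w[i] = w[i-16] + s0 + w[i-7] + s1;
         }
         for (int i = 0; i < 8; i++) v[i] = st[i];
         for (int i = 0; i < 64; i++) {
             uint32_t S1 = ror32(v[4],6) ^ ror32(v[4],11) ^ ror32(v[4],25);
             uint32_t ch = (v[4] & v[5]) ^ (~v[4] & v[6]);
             uint32_t T1 = v[7] + S1 + ch + K[i] + w[i];
             uint32_t S0 = ror32(v[0],2) ^ ror32(v[0],13) ^ ror32(v[0],22);
             uint32_t mj = (v[0] & v[1]) ^ (v[0] & v[2]) ^ (v[1] & v[2]);
             uint32_t T2 = S0 + mj;
             v[7]=v[6]; v[6]=v[5]; v[5]=v[4]; v[4]=v[3]+T1;   // permute
             v[3]=v[2]; v[2]=v[1]; v[1]=v[0]; v[0]=T1+T2;
         }
         for (int i = 0; i < 8; i++) st[i] += v[i];
     }
*/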

#if defined(__arm64__)

#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"

.subsections_via_symbols
    .text

    .p2align 4

K256:
    .long   0x428a2f98
    .long   0x71374491
    .long   0xb5c0fbcf
    .long   0xe9b5dba5
    .long   0x3956c25b
    .long   0x59f111f1
    .long   0x923f82a4
    .long   0xab1c5ed5
    .long   0xd807aa98
    .long   0x12835b01
    .long   0x243185be
    .long   0x550c7dc3
    .long   0x72be5d74
    .long   0x80deb1fe
    .long   0x9bdc06a7
    .long   0xc19bf174
    .long   0xe49b69c1
    .long   0xefbe4786
    .long   0x0fc19dc6
    .long   0x240ca1cc
    .long   0x2de92c6f
    .long   0x4a7484aa
    .long   0x5cb0a9dc
    .long   0x76f988da
    .long   0x983e5152
    .long   0xa831c66d
    .long   0xb00327c8
    .long   0xbf597fc7
    .long   0xc6e00bf3
    .long   0xd5a79147
    .long   0x06ca6351
    .long   0x14292967
    .long   0x27b70a85
    .long   0x2e1b2138
    .long   0x4d2c6dfc
    .long   0x53380d13
    .long   0x650a7354
    .long   0x766a0abb
    .long   0x81c2c92e
    .long   0x92722c85
    .long   0xa2bfe8a1
    .long   0xa81a664b
    .long   0xc24b8b70
    .long   0xc76c51a3
    .long   0xd192e819
    .long   0xd6990624
    .long   0xf40e3585
    .long   0x106aa070
    .long   0x19a4c116
    .long   0x1e376c08
    .long   0x2748774c
    .long   0x34b0bcb5
    .long   0x391c0cb3
    .long   0x4ed8aa4a
    .long   0x5b9cca4f
    .long   0x682e6ff3
    .long   0x748f82ee
    .long   0x78a5636f
    .long   0x84c87814
    .long   0x8cc70208
    .long   0x90befffa
    .long   0xa4506ceb
    .long   0xbef9a3f7
    .long   0xc67178f2

    .p2align 4

    .globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:

#define hashes      x0
#define numblocks   x1
#define data        x2
#define ktable      x3
    BRANCH_TARGET_CALL
#ifdef __ILP32__
    uxtw    numblocks, numblocks    // in arm64_32 size_t is 32-bit, so we need to extend it
#endif

    adrp    ktable, K256@page
    cbnz    numblocks, 1f       // if number of blocks is nonzero, go on for sha256 transform operation
    ret     lr                  // otherwise, return
1:
    add     ktable, ktable, K256@pageoff

#if BUILDKERNEL
    // save q0-q7, q16-q24 (8+8+1 = 17 vector registers)
    sub     x4, sp, #17*16
    sub     sp, sp, #17*16
    st1.4s  {v0, v1, v2, v3}, [x4], #64
    st1.4s  {v4, v5, v6, v7}, [x4], #64
    st1.4s  {v16, v17, v18, v19}, [x4], #64
    st1.4s  {v20, v21, v22, v23}, [x4], #64
    st1.4s  {v24}, [x4], #16
#endif

    ld1.4s  {v0,v1,v2,v3}, [data], #64  // w0,w1,w2,w3 need to bswap into big-endian

    rev32.16b   v0, v0          // byte swap of 1st 4 ints
    ldr         q21, [ktable, #16*0]
    rev32.16b   v1, v1          // byte swap of 2nd 4 ints
    ldr         q16, [hashes, #0]
    rev32.16b   v2, v2          // byte swap of 3rd 4 ints
    ldr         q17, [hashes, #16]
    rev32.16b   v3, v3          // byte swap of 4th 4 ints
    ldr         q22, [ktable, #16*1]

    mov.16b     v18, v16
    ldr         q23, [ktable, #16*2]
    add.4s      v4, v0, v21     // 1st 4 input + K256
    ldr         q24, [ktable, #16*3]
    add.4s      v5, v1, v22     // 2nd 4 input + K256
    mov.16b     v19, v17
    add.4s      v6, v2, v23     // 3rd 4 input + K256
    add.4s      v7, v3, v24     // 4th 4 input + K256
    add         ktable, ktable, #16*4

.macro sha256_round
    mov.16b     v20, v18
    SHA256SU0   $0, $1
    SHA256H     18, 19, $4
    SHA256SU1   $0, $2, $3
    SHA256H2    19, 20, $4
    add.4s      $6, $5, $7
.endm

// 4 rounds of hash update; also byte-swap the next message vector and compute its W+K
.macro sha256_hash_load_round
    mov.16b     v20, v18
    SHA256H     18, 19, $0
    rev32.16b   $1, $1
    SHA256H2    19, 20, $0
    add.4s      $2, $1, $3
.endm

.macro sha256_hash_round
    mov.16b     v20, v18
    SHA256H     18, 19, $0
    SHA256H2    19, 20, $0
.endm
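
/*
 The same 4-round step is expressible with the ACLE SHA-2 intrinsics from
 <arm_neon.h>; an illustrative sketch of one sha256_round invocation
 (e.g. "sha256_round 1, 2, 3, 0, 5, v1, v5, v22"), not the code above:

     uint32x4_t tmp = abcd;                    // v20 <- v18 (save state)
     w1   = vsha256su0q_u32(w1, w2);           // SHA256SU0: begin W update
     abcd = vsha256hq_u32(abcd, efgh, wk1);    // SHA256H:   4 rounds, lower half
     w1   = vsha256su1q_u32(w1, w3, w0);       // SHA256SU1: finish W update
     efgh = vsha256h2q_u32(efgh, tmp, wk1);    // SHA256H2:  4 rounds, upper half
     wk1  = vaddq_u32(w1, k1);                 // refill the consumed W+K slot
*/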

    // 12 vector hash and sequence update rounds
    mov     w4, #3
L_i_loop:
    mov.16b     v20, v18
    ldr         q21, [ktable, #0]   // k0
    SHA256SU0   0, 1
    ldr         q22, [ktable, #16]  // k1
    SHA256H     18, 19, 4
    ldr         q23, [ktable, #32]  // k2
    SHA256SU1   0, 2, 3
    ldr         q24, [ktable, #48]  // k3
    SHA256H2    19, 20, 4
    add         ktable, ktable, #64
    add.4s      v4, v0, v21

    sha256_round    1, 2, 3, 0, 5, v1, v5, v22
    sha256_round    2, 3, 0, 1, 6, v2, v6, v23
    subs        w4, w4, #1
    sha256_round    3, 0, 1, 2, 7, v3, v7, v24
    b.gt        L_i_loop

    subs        numblocks, numblocks, #1    // num_blocks--
    b.le        L_wrapup

    sub         ktable, ktable, #256

L_loop:

    ldr         q0, [data, #0]
    mov.16b     v20, v18
    ldr         q21, [ktable,#0]
    SHA256H     18, 19, 4
    ldr         q1, [data, #16]
    rev32.16b   v0, v0
    ldr         q2, [data, #32]
    SHA256H2    19, 20, 4
    ldr         q3, [data, #48]
    add.4s      v4, v0, v21

    ldr         q22, [ktable,#16]
    mov.16b     v20, v18
    add         data, data, #64
    SHA256H     18, 19, 5
    ldr         q23, [ktable,#32]
    rev32.16b   v1, v1
    ldr         q24, [ktable,#48]
    SHA256H2    19, 20, 5
    add.4s      v5, v1, v22

    sha256_hash_load_round  6, v2, v6, v23
    sha256_hash_load_round  7, v3, v7, v24

    add.4s      v18, v16, v18
    add.4s      v19, v17, v19
    mov.16b     v16, v18
    mov.16b     v17, v19

    // 12 vector hash and sequence update rounds
    mov.16b     v20, v18
    ldr         q21, [ktable, #16*4]    // k0
    SHA256SU0   0, 1
    ldr         q22, [ktable, #16*5]    // k1
    SHA256H     18, 19, 4
    ldr         q23, [ktable, #16*6]    // k2
    SHA256SU1   0, 2, 3
    ldr         q24, [ktable, #16*7]    // k3
    SHA256H2    19, 20, 4
    add.4s      v4, v0, v21

    sha256_round    1, 2, 3, 0, 5, v1, v5, v22
    sha256_round    2, 3, 0, 1, 6, v2, v6, v23
    sha256_round    3, 0, 1, 2, 7, v3, v7, v24
    mov.16b     v20, v18
    ldr         q21, [ktable, #16*8]    // k0
    SHA256SU0   0, 1
    ldr         q22, [ktable, #16*9]    // k1
    SHA256H     18, 19, 4
    ldr         q23, [ktable, #16*10]   // k2
    SHA256SU1   0, 2, 3
    ldr         q24, [ktable, #16*11]   // k3
    SHA256H2    19, 20, 4
    add.4s      v4, v0, v21

    sha256_round    1, 2, 3, 0, 5, v1, v5, v22
    sha256_round    2, 3, 0, 1, 6, v2, v6, v23
    sha256_round    3, 0, 1, 2, 7, v3, v7, v24

    mov.16b     v20, v18
    ldr         q21, [ktable, #16*12]   // k0
    SHA256SU0   0, 1
    ldr         q22, [ktable, #16*13]   // k1
    SHA256H     18, 19, 4
    ldr         q23, [ktable, #16*14]   // k2
    SHA256SU1   0, 2, 3
    ldr         q24, [ktable, #16*15]   // k3
    SHA256H2    19, 20, 4
    add.4s      v4, v0, v21

    sha256_round    1, 2, 3, 0, 5, v1, v5, v22
    sha256_round    2, 3, 0, 1, 6, v2, v6, v23
    sha256_round    3, 0, 1, 2, 7, v3, v7, v24

    subs        numblocks, numblocks, #1    // num_blocks--
    b.gt        L_loop

L_wrapup:

    sha256_hash_round   4
    sha256_hash_round   5
    sha256_hash_round   6
    sha256_hash_round   7

    add.4s      v16, v16, v18
    add.4s      v17, v17, v19
    st1.4s      {v16,v17}, [hashes]     // hashes q16 : d,c,b,a  q17 : h,g,f,e

#if BUILDKERNEL
    // restore q0-q7, q16-q24
    ld1.4s      {v0, v1, v2, v3}, [sp], #64
    ld1.4s      {v4, v5, v6, v7}, [sp], #64
    ld1.4s      {v16, v17, v18, v19}, [sp], #64
    ld1.4s      {v20, v21, v22, v23}, [sp], #64
    ld1.4s      {v24}, [sp], #16
#endif

    ret     lr

#endif // arm64

@ -0,0 +1,796 @@
# Copyright (c) (2011-2013,2015,2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

/*
    This is for Chinook AOP (arm64) that does not support crypto instructions.

    This file provides arm64 neon hand implementation of the following function

    void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

    which is a C function in sha2.c (from xnu).

    sha256 algorithm per block description:

        1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
        2. load 8 digests a-h from ctx->state
        3. for r = 0:15
            T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
            d += T1;
            h = T1 + Sigma0(a) + Maj(a,b,c)
            permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
        4. for r = 16:63
            W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
            T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
            d += T1;
            h = T1 + Sigma0(a) + Maj(a,b,c)
            permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

    In the assembly implementation:
        - a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
        - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
        - the 8 digests (a-h) will be stored in GPR or memory

    the implementation per block looks like

    ----------------------------------------------------------------------------

    load W(0:15) (big-endian per 4 bytes) into q0:q3
    pre_calculate and store W+K(0:15) in stack

    load digests a-h from ctx->state;

    for (r=0;r<48;r+=4) {
        digests a-h update and permute round r:r+3
        update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
    }

    for (r=48;r<64;r+=4) {
        digests a-h update and permute round r:r+3
    }

    ctx->states += digests a-h;

    ----------------------------------------------------------------------------

    our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
    into the last 16 rounds of its previous block:

    ----------------------------------------------------------------------------

    load W(0:15) (big-endian per 4 bytes) into q0:q3
    pre_calculate and store W+K(0:15) in stack

L_loop:

    load digests a-h from ctx->state;

    for (r=0;r<48;r+=4) {
        digests a-h update and permute round r:r+3
        update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
    }

    num_block--;
    if (num_block==0) jmp L_last_block;

    for (r=48;r<64;r+=4) {
        digests a-h update and permute round r:r+3
        load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
        pre_calculate and store W+K([r:r+3]%16) in stack
    }

    ctx->states += digests a-h;

    jmp L_loop;

L_last_block:

    for (r=48;r<64;r+=4) {
        digests a-h update and permute round r:r+3
    }

    ctx->states += digests a-h;

    ------------------------------------------------------------------------

    Apple CoreOS vector & numerics
*/

// associate variables with registers or memory

#define ctx         x0
#define num_blocks  x1
#define data        x2
#define ktable      x3

#define _i_loop     x4

#define a   w5
#define bb  w6
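// (note: "bb" rather than plain "b" -- a macro named "b" would also rewrite
// branch mnemonics such as "b.gt" when the file is run through the preprocessor)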
#define c   w7
#define d   w8
#define e   w9
#define f   w10
#define g   w11
#define h   w12

// 2 local variables
#define t   w13
#define s   w14

// a window (16 words) of message schedule
#define W0  v0
#define W1  v1
#define W2  v2
#define W3  v3
#define qW0 q0
#define qW1 q1
#define qW2 q2
#define qW3 q3
#define zero    v16
#define WK0 v4
#define WK1 v5
#define WK2 v6
#define WK3 v7
#define qWK0    q4
#define qWK1    q5
#define qWK2    q6
#define qWK3    q7

// circular buffer for WK[(r:r+15)%16]
#define WK(r)   [sp,#((r)&15)*4]

// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))

.macro Ch
    mvn     t, $0       // ~x
    and     s, $0, $1   // (x) & (y)
    and     t, t, $2    // (~(x)) & (z)
    eor     t, t, s     // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
.endm

// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

.macro Maj
    eor     t, $1, $2   // y^z
    and     s, $1, $2   // y&z
    and     t, t, $0    // x&(y^z)
    eor     t, t, s     // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm

// #define sigma0_256(x)   (S32(7,  (x)) ^ S32(18, (x)) ^ R(3 ,   (x)))

// performs sigma0_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma0
    vshr.u32    q6, $0, #7
    vshl.i32    q7, $0, #14
    vshr.u32    $0, $0, #3
    veor        $0, q6
    veor        $0, q7
    vshr.u32    q6, #11
    vshl.i32    q7, #11
    veor        $0, q6
    veor        $0, q7
.endm

// #define sigma1_256(x)   (S32(17, (x)) ^ S32(19, (x)) ^ R(10,   (x)))

// performs sigma1_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma1
    vshr.u32    q6, $0, #17
    vshl.i32    q7, $0, #13
    vshr.u32    $0, $0, #10
    veor        $0, q6
    veor        $0, q7
    vshr.u32    q6, #2
    vshl.i32    q7, #2
    veor        $0, q6
    veor        $0, q7
.endm

// #define Sigma0_256(x)   (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))

.macro Sigma0
    ror     t, $0, #2   // S32(2,  (x))
    ror     s, $0, #13  // S32(13, (x))
    eor     t, t, s     // S32(2,  (x)) ^ S32(13, (x))
    ror     s, s, #9    // S32(22, (x))
    eor     t, t, s     // t = (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm

// #define Sigma1_256(x)   (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))

.macro Sigma1
    ror     t, $0, #6   // S32(6,  (x))
    ror     s, $0, #11  // S32(11, (x))
    eor     t, t, s     // S32(6,  (x)) ^ S32(11, (x))
    ror     s, s, #14   // S32(25, (x))
    eor     t, t, s     // t = (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm

// per round digests update
.macro round
    // ror  t, $4, #6          // S32(6, (x))
    eor     t, t, $4, ror #11  // S32(6, (x)) ^ S32(11, (x))
    eor     t, t, $4, ror #25  // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    and     s, $4, $5          // (x) & (y)
    add     $7, $7, t          // use h to store h+Sigma1(e)
    bic     t, $6, $4          // (~(x)) & (z)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $8              //
    add     $7, $7, t          // t = h+Sigma1(e)+Ch(e,f,g);
    ror     t, $0, #2          // S32(2, (x))
    add     $7, $7, s          // h = T1
    eor     t, t, $0, ror #13  // S32(2, (x)) ^ S32(13, (x))
    add     $3, $3, $7         // d += T1;
    eor     t, t, $0, ror #22  // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     $7, $7, t          // h = T1 + Sigma0(a);
    eor     t, $1, $2          // y^z
    and     s, $1, $2          // y&z
    and     t, t, $0           // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    // add  $7, s              // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm

// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
    ror     t, $4, #6
    round   $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
    ror     t, $3, #6
    add     $7, s
    round   $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
    ror     t, $2, #6
    add     $6, s
    round   $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
    ror     t, $1, #6
    add     $5, s
    round   $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
    add     $4, s
.endm

.macro rounds_a
    ror     t, e, #6
    round   a, bb, c, d, e, f, g, h, $0.s[0]
    ror     t, d, #6
    add     h, h, s
    round   h, a, bb, c, d, e, f, g, $0.s[1]
    ror     t, c, #6
    add     g, g, s
    round   g, h, a, bb, c, d, e, f, $0.s[2]
    ror     t, bb, #6
    add     f, f, s
    round   f, g, h, a, bb, c, d, e, $0.s[3]
    add     e, e, s
.endm

.macro rounds_e
    ror     t, a, #6
    round   e, f, g, h, a, bb, c, d, $0.s[0]
    ror     t, h, #6
    add     d, d, s
    round   d, e, f, g, h, a, bb, c, $0.s[1]
    ror     t, g, #6
    add     c, c, s
    round   c, d, e, f, g, h, a, bb, $0.s[2]
    ror     t, f, #6
    add     bb, bb, s
    round   bb, c, d, e, f, g, h, a, $0.s[3]
    add     a, a, s
.endm

.macro rounds_a_update_W_WK
    ror     t, e, #6
    ldr     $3, [data], #16
    round   a, bb, c, d, e, f, g, h, $0.s[0]
    ror     t, d, #6
    rev32.16b   $1, $1
    add     h, h, s
    round   h, a, bb, c, d, e, f, g, $0.s[1]
    ror     t, c, #6
    add     g, g, s
    ldr     q17, [ktable], #16
    round   g, h, a, bb, c, d, e, f, $0.s[2]
    ror     t, bb, #6
    add     f, f, s
    round   f, g, h, a, bb, c, d, e, $0.s[3]
    add     e, e, s
    add.4s  $0, v17, $1
.endm

.macro rounds_e_update_W_WK
    ror     t, a, #6
    ldr     $3, [data], #16
    round   e, f, g, h, a, bb, c, d, $0.s[0]
    ror     t, h, #6
    rev32.16b   $1, $1
    add     d, d, s
    round   d, e, f, g, h, a, bb, c, $0.s[1]
    ror     t, g, #6
    add     c, c, s
    ldr     q17, [ktable], #16
    round   c, d, e, f, g, h, a, bb, $0.s[2]
    ror     t, f, #6
    add     bb, bb, s
    round   bb, c, d, e, f, g, h, a, $0.s[3]
    add     a, a, s
    add.4s  $0, v17, $1
.endm

// this macro is used in the last 16 rounds of the current block:
// it reads the next message block (16 4-byte words, 4 per invocation),
// loads them into the window W[r:r+3], and computes WK[r:r+3]
// (kept in v4-v7 here) to prepare for the next block

.macro update_W_WK
    ldr     $3, [data]
    ldr     $2, [ktable]
    add     data, data, #16
    rev32.16b   $1, $1
    add     ktable, ktable, #16
    add.4s  $0, $0, $1
.endm

.macro Update_Digits
    ldp     t, s, [ctx]
    add     a, a, t
    add     bb, bb, s
    stp     a, bb, [ctx]

    ldp     t, s, [ctx,#8]
    add     c, c, t
    add     d, d, s
    stp     c, d, [ctx, #8]

    ldp     t, s, [ctx,#16]
    add     e, e, t
    add     f, f, s
    stp     e, f, [ctx, #16]

    ldp     t, s, [ctx,#24]
    add     g, g, t
    add     h, h, s
    stp     g, h, [ctx, #24]
.endm

.macro rounds_a_schedule_update
    eor     t, e, e, ror #5    // S32(6, (x)) ^ S32(11, (x))
    ldr     q17, [ktable], #16
    eor     t, t, e, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    ext.16b v18, $1, $2, #4    // w4:w1
    ror     t, t, #6
    and     s, e, f            // (x) & (y)
    add     h, h, t            // use h to store h+Sigma1(e)
    bic     t, g, e            // (~(x)) & (z)
    ushr.4s v19, v18, #7
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[0]         //
    add     h, h, t            // t = h+Sigma1(e)+Ch(e,f,g);
    shl.4s  v20, v18, #14
    eor     t, a, a, ror #11   // S32(2, (x)) ^ S32(13, (x))
    ushr.4s v18, v18, #3
    add     h, h, s            // h = T1
    eor     t, t, a, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     d, d, h            // d += T1;
    ror     t, t, #2
    eor.16b v18, v18, v19
    add     h, h, t            // h = T1 + Sigma0(a);
    ushr.4s v19, v19, #11
    eor     t, bb, c           // y^z
    and     s, bb, c           // y&z
    and     t, t, a            // x&(y^z)
    eor.16b v18, v18, v20
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    shl.4s  v20, v20, #11
    eor     t, d, d, ror #5    // S32(6, (x)) ^ S32(11, (x))

    add     h, h, s
    eor     t, t, d, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    eor.16b v18, v18, v19
    and     s, d, e            // (x) & (y)
    ext.16b v19, $3, $4, #4    // q19 = w12:w9
    ror     t, t, #6
    add     g, g, t            // use h to store h+Sigma1(e)
    eor.16b v18, v18, v20
    bic     t, f, d            // (~(x)) & (z)

    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[1]         //
    add     g, g, t            // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, h, h, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add.4s  $1, $1, v18        // w3:w0 + sigma0(w4:w1)
    add     g, g, s            // h = T1
    ext.16b v18, $4, zero, #8  // 0 0 w15:w14
    eor     t, t, h, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add.4s  $1, $1, v19        // w3:w0 + sigma0(w4:w1) + w12:w9
    ror     t, t, #2
    add     c, c, g            // d += T1;
    ushr.4s v19, v18, #17
    add     g, g, t            // h = T1 + Sigma0(a);
    shl.4s  v20, v18, #13
    eor     t, a, bb           // y^z
    ushr.4s v18, v18, #10
    and     s, a, bb           // y&z
    and     t, t, h            // x&(y^z)
    eor.16b v18, v18, v19
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    ushr.4s v19, v19, #2

    eor     t, c, c, ror #5    // S32(6, (x)) ^ S32(11, (x))
    add     g, g, s
    eor.16b v18, v18, v20
    eor     t, t, c, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    shl.4s  v20, v20, #2
    ror     t, t, #6
    and     s, c, d            // (x) & (y)
    eor.16b v18, v18, v19
    add     f, f, t            // use h to store h+Sigma1(e)
    eor.16b v18, v18, v20
    bic     t, e, c            // (~(x)) & (z)
    add.4s  $1, $1, v18        // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[2]         //
    add     f, f, t            // t = h+Sigma1(e)+Ch(e,f,g);
    ext.16b v18, zero, $1, #8  // Q4 = (w17:w16 0 0)
    eor     t, g, g, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add     f, f, s            // h = T1
    eor     t, t, g, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    ushr.4s v19, v18, #17
    add     bb, bb, f          // d += T1;
    shl.4s  v20, v18, #13
    ror     t, t, #2
    ushr.4s v18, v18, #10
    add     f, f, t            // h = T1 + Sigma0(a);
    eor     t, h, a            // y^z
    and     s, h, a            // y&z
    eor.16b v18, v18, v19
    and     t, t, g            // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

    eor     t, bb, bb, ror #5  // S32(6, (x)) ^ S32(11, (x))
    add     f, f, s
    eor.16b v18, v18, v20
    eor     t, t, bb, ror #19  // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    ushr.4s v19, v19, #2
    ror     t, t, #6
    shl.4s  v20, v20, #2

    and     s, bb, c           // (x) & (y)
    eor.16b v18, v18, v19
    add     e, e, t            // use h to store h+Sigma1(e)
    bic     t, d, bb           // (~(x)) & (z)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[3]         //
    add     e, e, t            // t = h+Sigma1(e)+Ch(e,f,g);
    eor.16b v18, v18, v20
    eor     t, f, f, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add     e, e, s            // h = T1
    eor     t, t, f, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     a, a, e            // d += T1;
    ror     t, t, #2
    add.4s  $1, $1, v18        // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
    add     e, e, t            // h = T1 + Sigma0(a);
    eor     t, g, h            // y^z
    and     s, g, h            // y&z
    add.4s  $5, v17, $1        // W+K
    and     t, t, f            // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    add     e, e, s

.endm

.macro rounds_e_schedule_update
    eor     t, a, a, ror #5    // S32(6, (x)) ^ S32(11, (x))
    ldr     q17, [ktable], #16 // K
    eor     t, t, a, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    ext.16b v18, $1, $2, #4    // Q18 = w4:w1
    ror     t, t, #6
    and     s, a, bb           // (x) & (y)
    add     d, d, t            // use h to store h+Sigma1(e)
    bic     t, c, a            // (~(x)) & (z)
    ushr.4s v19, v18, #7
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[0]
    add     d, d, t            // t = h+Sigma1(e)+Ch(e,f,g);
    shl.4s  v20, v18, #14
    eor     t, e, e, ror #11   // S32(2, (x)) ^ S32(13, (x))
    ushr.4s v18, v18, #3
    add     d, d, s            // h = T1
    eor     t, t, e, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add     h, h, d            // d += T1;
    ror     t, t, #2
    eor.16b v18, v18, v19
    add     d, d, t            // h = T1 + Sigma0(a);
    ushr.4s v19, v19, #11
    eor     t, f, g            // y^z
    and     s, f, g            // y&z
    and     t, t, e            // x&(y^z)
    eor.16b v18, v18, v20
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    shl.4s  v20, v20, #11
    eor     t, h, h, ror #5    // S32(6, (x)) ^ S32(11, (x))

    add     d, d, s
    eor     t, t, h, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    eor.16b v18, v18, v19
    and     s, h, a            // (x) & (y)
    ext.16b v19, $3, $4, #4    // q19 = w12:w9
    ror     t, t, #6
    add     c, c, t            // use h to store h+Sigma1(e)
    eor.16b v18, v18, v20
    bic     t, bb, h           // (~(x)) & (z)

    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[1]
    add     c, c, t            // t = h+Sigma1(e)+Ch(e,f,g);
    eor     t, d, d, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add.4s  $1, $1, v18        // w3:w0 + sigma0(w4:w1)
    add     c, c, s            // h = T1
    ext.16b v18, $4, zero, #8  // 0 0 w15:w14
    eor     t, t, d, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    add.4s  $1, $1, v19        // w3:w0 + sigma0(w4:w1) + w12:w9
    ror     t, t, #2
    add     g, g, c            // d += T1;
    ushr.4s v19, v18, #17
    add     c, c, t            // h = T1 + Sigma0(a);
    shl.4s  v20, v18, #13
    eor     t, e, f            // y^z
    ushr.4s v18, v18, #10
    and     s, e, f            // y&z
    and     t, t, d            // x&(y^z)
    eor.16b v18, v18, v19
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    ushr.4s v19, v19, #2

    eor     t, g, g, ror #5    // S32(6, (x)) ^ S32(11, (x))
    add     c, c, s
    eor.16b v18, v18, v20
    eor     t, t, g, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    shl.4s  v20, v20, #2
    ror     t, t, #6
    and     s, g, h            // (x) & (y)
    eor.16b v18, v18, v19
    add     bb, bb, t          // use h to store h+Sigma1(e)
    eor.16b v18, v18, v20
    bic     t, a, g            // (~(x)) & (z)
    add.4s  $1, $1, v18        // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[2]
    add     bb, bb, t          // t = h+Sigma1(e)+Ch(e,f,g);
    ext.16b v18, zero, $1, #8  // Q18 = (w17:w16 0 0)
    eor     t, c, c, ror #11   // S32(2, (x)) ^ S32(13, (x))
    add     bb, bb, s          // h = T1
    eor     t, t, c, ror #20   // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    ushr.4s v19, v18, #17
    add     f, f, bb           // d += T1;
    shl.4s  v20, v18, #13
    ror     t, t, #2
    ushr.4s v18, v18, #10
    add     bb, bb, t          // h = T1 + Sigma0(a);
    eor     t, d, e            // y^z
    and     s, d, e            // y&z
    eor.16b v18, v18, v19
    and     t, t, c            // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

    eor     t, f, f, ror #5    // S32(6, (x)) ^ S32(11, (x))
    add     bb, bb, s
    eor.16b v18, v18, v20
    eor     t, t, f, ror #19   // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
    ushr.4s v19, v19, #2
    ror     t, t, #6
    shl.4s  v20, v20, #2

    and     s, f, g            // (x) & (y)
    add     a, a, t            // use h to store h+Sigma1(e)
    eor.16b v18, v18, v19
    bic     t, h, f            // (~(x)) & (z)
    eor     t, t, s            // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
    mov     s, $5.s[3]
    add     a, a, t            // t = h+Sigma1(e)+Ch(e,f,g);
    eor.16b v18, v18, v20
    eor     t, bb, bb, ror #11 // S32(2, (x)) ^ S32(13, (x))
    add     a, a, s            // h = T1
    eor     t, t, bb, ror #20  // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))  // t = Sigma0(a);
    ror     t, t, #2
    add.4s  $1, $1, v18        // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
    add     e, e, a            // d += T1;
    add     a, a, t            // h = T1 + Sigma0(a);
    eor     t, c, d            // y^z
    and     s, c, d            // y&z
    add.4s  $5, v17, $1        // W+K
    and     t, t, bb           // x&(y^z)
    eor     s, s, t            // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    add     a, a, s
.endm

#if defined(__arm64__)
#include "ccarm_pac_bti_macros.h"

.subsections_via_symbols
    .text
    .p2align 4

K256:
    .long   0x428a2f98
    .long   0x71374491
    .long   0xb5c0fbcf
    .long   0xe9b5dba5
    .long   0x3956c25b
    .long   0x59f111f1
    .long   0x923f82a4
    .long   0xab1c5ed5
    .long   0xd807aa98
    .long   0x12835b01
    .long   0x243185be
    .long   0x550c7dc3
    .long   0x72be5d74
    .long   0x80deb1fe
    .long   0x9bdc06a7
    .long   0xc19bf174
    .long   0xe49b69c1
    .long   0xefbe4786
    .long   0x0fc19dc6
    .long   0x240ca1cc
    .long   0x2de92c6f
    .long   0x4a7484aa
    .long   0x5cb0a9dc
    .long   0x76f988da
    .long   0x983e5152
    .long   0xa831c66d
    .long   0xb00327c8
    .long   0xbf597fc7
    .long   0xc6e00bf3
    .long   0xd5a79147
    .long   0x06ca6351
    .long   0x14292967
    .long   0x27b70a85
    .long   0x2e1b2138
    .long   0x4d2c6dfc
    .long   0x53380d13
    .long   0x650a7354
    .long   0x766a0abb
    .long   0x81c2c92e
    .long   0x92722c85
    .long   0xa2bfe8a1
    .long   0xa81a664b
    .long   0xc24b8b70
    .long   0xc76c51a3
    .long   0xd192e819
    .long   0xd6990624
    .long   0xf40e3585
    .long   0x106aa070
    .long   0x19a4c116
    .long   0x1e376c08
    .long   0x2748774c
    .long   0x34b0bcb5
    .long   0x391c0cb3
    .long   0x4ed8aa4a
    .long   0x5b9cca4f
    .long   0x682e6ff3
    .long   0x748f82ee
    .long   0x78a5636f
    .long   0x84c87814
    .long   0x8cc70208
    .long   0x90befffa
    .long   0xa4506ceb
    .long   0xbef9a3f7
    .long   0xc67178f2

    .p2align 4

    .globl _AccelerateCrypto_SHA256_compress_arm64neon
_AccelerateCrypto_SHA256_compress_arm64neon:
    BRANCH_TARGET_CALL
    adrp    ktable, K256@page
    cbnz    num_blocks, 1f      // if number of blocks is nonzero, go on for sha256 transform operation
    ret     lr                  // otherwise, return
1:
    add     ktable, ktable, K256@pageoff

#if BUILDKERNEL
    // save q0-q7, q16-q20 8+4+1=13
    sub     x4, sp, #13*16
    sub     sp, sp, #13*16
    st1.4s  {v0, v1, v2, v3}, [x4], #64
    st1.4s  {v4, v5, v6, v7}, [x4], #64
    st1.4s  {v16, v17, v18, v19}, [x4], #64
    st1.4s  {v20}, [x4]
#endif

    // load W[0:15]
    ldr     qW0, [data, #0*16]
    movi.16b    zero, #0
    ldr     qW1, [data, #1*16]
    ldr     qW2, [data, #2*16]
    ldr     qW3, [data, #3*16]
    add     data, data, #4*16

    // load K[0:15] & per word byte swap
    rev32.16b   W0, W0
    ldr     qWK0, [ktable, #0*16]
    rev32.16b   W1, W1
    ldr     qWK1, [ktable, #1*16]
    rev32.16b   W2, W2
    ldr     qWK2, [ktable, #2*16]
    rev32.16b   W3, W3
    ldr     qWK3, [ktable, #3*16]

    // compute WK[0:15]
    add     ktable, ktable, #4*16
    add.4s  WK0, WK0, W0
    ldp     a, bb, [ctx, #0*4]
    add.4s  WK1, WK1, W1
    ldp     c, d, [ctx, #2*4]
    add.4s  WK2, WK2, W2
    ldp     e, f, [ctx, #4*4]
    add.4s  WK3, WK3, W3
    ldp     g, h, [ctx, #6*4]

L_loop:

    // rounds 0:47 interleaved with W/WK update for rounds 16:63
    mov     _i_loop, #3
L_i_loop:
    rounds_a_schedule_update    0,W0,W1,W2,W3, WK0
    rounds_e_schedule_update    4,W1,W2,W3,W0, WK1
    rounds_a_schedule_update    8,W2,W3,W0,W1, WK2
    rounds_e_schedule_update    12,W3,W0,W1,W2, WK3
    subs    _i_loop, _i_loop, #1
    b.gt    L_i_loop

    // revert K to the beginning of K256[]
    subs    num_blocks, num_blocks, #1  // num_blocks--
    sub     ktable, ktable, #256
    b.eq    L_final_block       // if final block, wrap up final rounds

    // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
    rounds_a_update_W_WK    WK0, W0, qWK0, qW0
    rounds_e_update_W_WK    WK1, W1, qWK1, qW1
    rounds_a_update_W_WK    WK2, W2, qWK2, qW2
    rounds_e_update_W_WK    WK3, W3, qWK3, qW3

    // ctx->states += digests a-h, also update digest variables a-h
    Update_Digits

    b.al    L_loop              // branch for next block

    // wrap up digest update round 48:63 for final block
L_final_block:
    rounds_a    WK0
    rounds_e    WK1
    rounds_a    WK2
    rounds_e    WK3

    // ctx->states += digests a-h
    Update_Digits

#if BUILDKERNEL
    // restore q0-q7, q16-q20
    ld1.4s  {v0, v1, v2, v3}, [sp], #64
    ld1.4s  {v4, v5, v6, v7}, [sp], #64
    ld1.4s  {v16, v17, v18, v19}, [sp], #64
    ld1.4s  {v20}, [sp], #16
#endif

    ret     lr

#endif /* arm64 */
|
||||
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
/* Copyright (c) (2010,2014-2016,2019,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <stdint.h>
#include <corecrypto/cc_config.h>

/* the K array */
const uint32_t sha256_K[64] CC_ALIGNED(16) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b,
    0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01,
    0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7,
    0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152,
    0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc,
    0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819,
    0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08,
    0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f,
    0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
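/*
 A hedged cross-check (illustration only, not part of corecrypto): FIPS 180-4
 derives sha256_K[i] from the first 32 bits of the fractional part of the cube
 root of the i-th prime. Double precision is sufficient for these 64 constants
 in practice.

	#include <math.h>
	#include <stdint.h>
	#include <stdio.h>

	static int is_prime(int n) {
		for (int d = 2; d * d <= n; d++)
			if (n % d == 0) return 0;
		return n >= 2;
	}

	int main(void) {
		for (int p = 2, i = 0; i < 64; p++) {
			if (!is_prime(p)) continue;
			double frac = cbrt((double)p);
			frac -= floor(frac);
			uint32_t k = (uint32_t)(frac * 4294967296.0);	// floor(frac * 2^32)
			printf("K[%2d] = 0x%08x\n", i++, k);		// compare against sha256_K[i]
		}
		return 0;
	}
*/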

@ -0,0 +1,30 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"

#if (defined(__x86_64__) || defined(__i386__))
extern void AccelerateCrypto_SHA256_compress_ssse3(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_ssse3");
extern void AccelerateCrypto_SHA256_compress_AVX1(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_AVX1");
extern void AccelerateCrypto_SHA256_compress_AVX2(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_AVX2");

void AccelerateCrypto_SHA256_compress(uint32_t *state, size_t num, const void *buf)
{
#if defined(__x86_64__)
    if (HAS_AVX2()) AccelerateCrypto_SHA256_compress_AVX2(state, num, buf);
    else if (HAS_AVX1()) AccelerateCrypto_SHA256_compress_AVX1(state, num, buf);
    else
#endif
    AccelerateCrypto_SHA256_compress_ssse3(state, num, buf);
}
#endif // (defined(__x86_64__) || defined(__i386__))
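/*
 HAS_AVX1()/HAS_AVX2() come from config.h, which is not shown in this diff. As a
 hedged illustration of the kind of test HAS_AVX2() typically performs (a full
 check would also verify OSXSAVE/XGETBV state), one might write:

	#include <cpuid.h>
	#include <stdbool.h>

	static bool has_avx2(void) {
		unsigned int eax, ebx, ecx, edx;
		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
			return false;
		return (ebx & (1u << 5)) != 0;	// CPUID.(EAX=7,ECX=0):EBX bit 5 = AVX2
	}
*/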
File diff suppressed because it is too large
File diff suppressed because it is too large

@ -0,0 +1,504 @@
# Copyright (c) (2010,2011,2012,2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>

/*
	This file provides an i386 hand implementation of the following function

	sha2_void sha256_compile(sha256_ctx ctx[1]);

	which is a C function in CommonCrypto Source/Digest/sha2.c

	The implementation here is modified from an earlier i386 sha256 implementation in xnu.
	To modify it to fit the new API,
		the old ctx (which pointed to ctx->hashes) should be changed to ctx->hashes, 8(ctx).
		the old data (which pointed to ctx->wbuf) should be changed to ctx->wbuf, 40(ctx).

	sha256_compile handles 1 input block (64 bytes) per call.


	The following are the comments from the initial xnu-sha256.s.

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
*/
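/*
 Portable C model of the round described above (a hedged reference for checking
 the assembly against sha2.c; names here are illustrative, not from this file):

	#include <stdint.h>

	static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

	static uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
	static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
	static uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
	static uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }

	// one round r; the permutation is absorbed by rotating the roles of a-h at the call site
	static void round_r(uint32_t a, uint32_t b, uint32_t c, uint32_t *d,
	                    uint32_t e, uint32_t f, uint32_t g, uint32_t *h,
	                    uint32_t WK)	// WK = K[r] + W[r], as kept in the stack buffer
	{
		uint32_t T1 = *h + Sigma1(e) + Ch(e, f, g) + WK;
		*d += T1;
		*h  = T1 + Sigma0(a) + Maj(a, b, c);
	}
*/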
#if defined __i386__

	// associate variables with registers or memory

	#define	sp	%esp
	#define	stack_size	(12+16*8+16+16+64)	// 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
	#define	ctx_addr	20+stack_size(sp)	// ret_addr + 4 registers = 20, 1st caller argument
	#define	num_blocks	24+stack_size(sp)	// 2nd caller argument
	#define	data_addr	28+stack_size(sp)	// 3rd caller argument

	#define	a	%ebx
	#define	b	%edx
	#define	c	64(sp)
	#define	d	%ebp
	#define	e	%esi
	#define	f	68(sp)
	#define	g	%edi
	#define	h	72(sp)

	#define	K	76(sp)			// pointer to K256[] table
	#define	L_aligned_bswap	80(sp)		// bswap : big-endian loading of 4-byte words
	#define	xmm_save	96(sp)		// starting address for xmm save/restore

	// 2 local variables
	#define	t	%eax
	#define	s	%ecx

	// a window (16 words) of message schedule
	#define	W0	%xmm0
	#define	W1	%xmm1
	#define	W2	%xmm2
	#define	W3	%xmm3

	// circular buffer for WK[(r:r+15)%16]
	#define WK(x)   ((x)&15)*4(sp)
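/*
 A minimal C model of the circular buffer WK(x) addresses (illustration only):
 sixteen 4-byte slots at the bottom of the stack, indexed mod 16, so WK(r) and
 WK(r+16) alias the same slot.

	uint32_t wk[16];
	#define WK(r) wk[(r) & 15]
*/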

// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))

	.macro Ch
	mov	$0, t		// x
	mov	$0, s		// x
	not	t		// ~x
	and	$1, s		// x & y
	and	$2, t		// ~x & z
	xor	s, t		// t = ((x) & (y)) ^ ((~(x)) & (z));
	.endm

// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	.macro	Maj
	mov	$1, t		// y
	mov	$2, s		// z
	xor	$2, t		// y^z
	and	$1, s		// y&z
	and	$0, t		// x&(y^z)
	xor	s, t		// Maj(x,y,z)
	.endm

// #define sigma0_256(x)   (S32(7,  (x)) ^ S32(18, (x)) ^ R(3 ,   (x)))

	// performs sigma0_256 on 4 words on xmm registers
	// use xmm6/xmm7 as intermediate registers
	.macro	sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0			// SHR3(x)
	psrld	$$7, %xmm6		// part of ROTR7
	pslld	$$14, %xmm7		// part of ROTR18
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6		// part of ROTR18
	pslld	$$11, %xmm7		// part of ROTR7
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	.endm

// #define sigma1_256(x)   (S32(17, (x)) ^ S32(19, (x)) ^ R(10,   (x)))

	// performs sigma1_256 on 4 words on xmm registers
	// use xmm6/xmm7 as intermediate registers
	.macro	sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0		// SHR10(x)
	psrld	$$17, %xmm6		// part of ROTR17
	pxor	%xmm6, $0
	pslld	$$13, %xmm7		// part of ROTR19
	pxor	%xmm7, $0
	psrld	$$2, %xmm6		// part of ROTR19
	pxor	%xmm6, $0
	pslld	$$2, %xmm7		// part of ROTR17
	pxor	%xmm7, $0
	.endm

// #define Sigma0_256(x)   (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))

	.macro	Sigma0
	mov	$0, t			// x
	mov	$0, s			// x
	ror	$$2, t			// S32(2,  (x))
	ror	$$13, s			// S32(13, (x))
	xor	s, t			// S32(2,  (x)) ^ S32(13, (x))
	ror	$$9, s			// S32(22, (x))
	xor	s, t			// t = (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
	.endm

// #define Sigma1_256(x)   (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))

	.macro	Sigma1
	mov	$0, s			// x
	ror	$$6, s			// S32(6,  (x))
	mov	s, t			// S32(6,  (x))
	ror	$$5, s			// S32(11, (x))
	xor	s, t			// S32(6,  (x)) ^ S32(11, (x))
	ror	$$14, s			// S32(25, (x))
	xor	s, t			// t = (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
	.endm

	// per round digests update
	.macro	round
	Sigma1	$4			// t = Sigma1(e);
	add	t, $7			// use h to store h+Sigma1(e)
	Ch	$4, $5, $6		// t = Ch (e, f, g);
	add	$7, t			// t = h+Sigma1(e)+Ch(e,f,g);
	add	WK($8), t		// t = h+Sigma1(e)+Ch(e,f,g)+WK = T1
	add	t, $3			// d += T1;
	mov	t, $7			// h = T1
	Sigma0	$0			// t = Sigma0(a);
	add	t, $7			// h = T1 + Sigma0(a);
	Maj	$0, $1, $2		// t = Maj(a,b,c)
	add	t, $7			// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

	// per 4 rounds digests update and permutation
	// permutation is absorbed by rotating the roles of digests a-h
	.macro	rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	.endm

	// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
	.macro	message_schedule

	// 4 32-bit K256 words in xmm5
	mov	K, t
	movdqu	(t), %xmm5
	addl	$$16, K			// K points to next K256 word for next iteration
	movdqa	$1, %xmm4		// W7:W4
	palignr	$$4, $0, %xmm4		// W4:W1
	sigma0	%xmm4			// sigma0(W4:W1)
	movdqa	$3, %xmm6		// W15:W12
	paddd	%xmm4, $0		// $0 = W3:W0 + sigma0(W4:W1)
	palignr	$$4, $2, %xmm6		// W12:W9
	paddd	%xmm6, $0		// $0 = W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$3, %xmm4		// W15:W12
	psrldq	$$8, %xmm4		// 0,0,W15,W14
	sigma1	%xmm4			// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0		// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4		// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4		// W17, W16, 0, 0
	sigma1	%xmm4			// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0		// W19:W16
	paddd	$0, %xmm5		// WK
	movdqa	%xmm5, WK($4)
	.endm

	// this macro is used in the last 16 rounds of a current block
	// it reads the next message (16 4-byte words), loads it into 4 words W[r:r+3], computes WK[r:r+3]
	// and saves it into the stack to prepare for the next block

	.macro	update_W_WK
	mov	data_addr, t
	movdqu	$0*16(t), $1		// read 4 4-byte words
	pshufb	L_aligned_bswap, $1	// big-endian of each 4-byte word, W[r:r+3]
	mov	K, t
	movdqu	$0*16(t), %xmm4		// K[r:r+3]
	paddd	$1, %xmm4		// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)		// save WK[r:r+3] into stack circular buffer
	.endm

	.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_sha256_K$non_lazy_ptr:
	.indirect_symbol	CC_C_LABEL(sha256_K)
	.long	0

	.text
	.globl	_AccelerateCrypto_SHA256_compress_ssse3
_AccelerateCrypto_SHA256_compress_ssse3:

	// push callee-saved registers
	push	%ebp
	push	%ebx
	push	%esi
	push	%edi

	// allocate stack space
	sub	$stack_size, sp

	// if kernel code, save used xmm registers
#if BUILDKERNEL
	movdqa	%xmm0, 0*16+xmm_save
	movdqa	%xmm1, 1*16+xmm_save
	movdqa	%xmm2, 2*16+xmm_save
	movdqa	%xmm3, 3*16+xmm_save
	movdqa	%xmm4, 4*16+xmm_save
	movdqa	%xmm5, 5*16+xmm_save
	movdqa	%xmm6, 6*16+xmm_save
	movdqa	%xmm7, 7*16+xmm_save
#endif

	// set up bswap parameters in the aligned stack space and pointer to table K256[]
	call	0f			// Push program counter onto stack.
0:	pop	t			// Get program counter.
	mov	L_sha256_K$non_lazy_ptr-0b(t), t
	mov	t, K
	call	0f			// Push program counter onto stack.
0:	pop	%eax			// Get program counter.
	lea	L_bswap-0b(%eax), %eax
	movdqa	(%eax), %xmm0
	movdqa	%xmm0, L_aligned_bswap

	// load W[0:15] into xmm0-xmm3
	mov	data_addr, t
	movdqu	0*16(t), W0
	movdqu	1*16(t), W1
	movdqu	2*16(t), W2
	movdqu	3*16(t), W3
	addl	$64, data_addr

	pshufb	L_aligned_bswap, W0
	pshufb	L_aligned_bswap, W1
	pshufb	L_aligned_bswap, W2
	pshufb	L_aligned_bswap, W3

	// compute WK[0:15] and save in stack
	mov	K, t
	movdqu	0*16(t), %xmm4
	movdqu	1*16(t), %xmm5
	movdqu	2*16(t), %xmm6
	movdqu	3*16(t), %xmm7
	addl	$64, K
	paddd	%xmm0, %xmm4
	paddd	%xmm1, %xmm5
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm4, WK(0)
	movdqa	%xmm5, WK(4)
	movdqa	%xmm6, WK(8)
	movdqa	%xmm7, WK(12)

L_loop:

	// digests a-h = ctx->states;
	mov	ctx_addr, t
	mov	0*4(t), a
	mov	1*4(t), b
	mov	2*4(t), s
	mov	s, c
	mov	3*4(t), d
	mov	4*4(t), e
	mov	5*4(t), s
	mov	s, f
	mov	6*4(t), g
	mov	7*4(t), s
	mov	s, h

	// rounds 0:47 interleaved with W/WK update for rounds 16:63
	rounds	a, b, c, d, e, f, g, h, 0
	message_schedule	W0,W1,W2,W3,16
	rounds	e, f, g, h, a, b, c, d, 4
	message_schedule	W1,W2,W3,W0,20
	rounds	a, b, c, d, e, f, g, h, 8
	message_schedule	W2,W3,W0,W1,24
	rounds	e, f, g, h, a, b, c, d, 12
	message_schedule	W3,W0,W1,W2,28
	rounds	a, b, c, d, e, f, g, h, 16
	message_schedule	W0,W1,W2,W3,32
	rounds	e, f, g, h, a, b, c, d, 20
	message_schedule	W1,W2,W3,W0,36
	rounds	a, b, c, d, e, f, g, h, 24
	message_schedule	W2,W3,W0,W1,40
	rounds	e, f, g, h, a, b, c, d, 28
	message_schedule	W3,W0,W1,W2,44
	rounds	a, b, c, d, e, f, g, h, 32
	message_schedule	W0,W1,W2,W3,48
	rounds	e, f, g, h, a, b, c, d, 36
	message_schedule	W1,W2,W3,W0,52
	rounds	a, b, c, d, e, f, g, h, 40
	message_schedule	W2,W3,W0,W1,56
	rounds	e, f, g, h, a, b, c, d, 44
	message_schedule	W3,W0,W1,W2,60

	// revert K to the beginning of K256[]
	subl	$256, K
	subl	$1, num_blocks		// num_blocks--

	je	L_final_block		// if final block, wrap up final rounds

	// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
	rounds	a, b, c, d, e, f, g, h, 48
	update_W_WK	0, W0
	rounds	e, f, g, h, a, b, c, d, 52
	update_W_WK	1, W1
	rounds	a, b, c, d, e, f, g, h, 56
	update_W_WK	2, W2
	rounds	e, f, g, h, a, b, c, d, 60
	update_W_WK	3, W3

	addl	$64, K
	addl	$64, data_addr

	// ctx->states += digests a-h
	mov	ctx_addr, t
	add	a, 0*4(t)
	add	b, 1*4(t)
	mov	c, s
	add	s, 2*4(t)
	add	d, 3*4(t)
	add	e, 4*4(t)
	mov	f, s
	add	s, 5*4(t)
	add	g, 6*4(t)
	mov	h, s
	add	s, 7*4(t)

	jmp	L_loop			// branch for next block

	// wrap up digest update round 48:63 for final block
L_final_block:
	rounds	a, b, c, d, e, f, g, h, 48
	rounds	e, f, g, h, a, b, c, d, 52
	rounds	a, b, c, d, e, f, g, h, 56
	rounds	e, f, g, h, a, b, c, d, 60

	// ctx->states += digests a-h
	mov	ctx_addr, t
	add	a, 0*4(t)
	add	b, 1*4(t)
	mov	c, s
	add	s, 2*4(t)
	add	d, 3*4(t)
	add	e, 4*4(t)
	mov	f, s
	add	s, 5*4(t)
	add	g, 6*4(t)
	mov	h, s
	add	s, 7*4(t)

	// if kernel, restore xmm0-xmm7
#if BUILDKERNEL
	movdqa	0*16+xmm_save, %xmm0
	movdqa	1*16+xmm_save, %xmm1
	movdqa	2*16+xmm_save, %xmm2
	movdqa	3*16+xmm_save, %xmm3
	movdqa	4*16+xmm_save, %xmm4
	movdqa	5*16+xmm_save, %xmm5
	movdqa	6*16+xmm_save, %xmm6
	movdqa	7*16+xmm_save, %xmm7
#endif

	// free allocated stack memory
	add	$stack_size, sp

	// restore callee-saved registers
	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp

	// return
	ret

	// data for using ssse3 pshufb instruction (big-endian loading of data)
	CC_ASM_SECTION_CONST
	.p2align	4, 0x90

L_bswap:
	.long	0x00010203
	.long	0x04050607
	.long	0x08090a0b
	.long	0x0c0d0e0f

#endif	// i386
File diff suppressed because it is too large

@ -0,0 +1,564 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
	This file provides an armv7 neon hand implementation of the following function

	void sha512_compress(uint64_t *state, size_t nblocks, const void *in);

	sha512 algorithm per block description:

		1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
		2. load 8 digests (each 64bit) a-h from state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:79
				W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in q8-q15
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) are stored in NEON registers d0-d7 (per the defines below)

	----------------------------------------------------------------------------

	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 8 bytes) into q8:q15
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<64;r+=2) {
		digests a-h update and permute round r:r+1
		update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=64;r<80;r+=2) {
		digests a-h update and permute round r:r+1
		load W([r:r+1]%16) (big-endian per 8 bytes) into q8:q15
		pre_calculate and store W+K([r:r+1]%16) in stack
	}

	ctx->states += digests a-h;

	jmp L_loop;

L_last_block:

	for (r=64;r<80;r+=2) {
		digests a-h update and permute round r:r+1
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
*/
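/*
 Hedged C model of the schedule recurrence used below (illustrative names; the
 NEON code builds each rotation from the same shift pairs):

	#include <stdint.h>

	static uint64_t ror64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

	static uint64_t Gamma0(uint64_t x) { return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); }
	static uint64_t Gamma1(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); }

	// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]),
	// kept in a 16-entry circular window, like the q-register window here
	static uint64_t schedule(uint64_t W[16], int r)
	{
		W[r & 15] += Gamma1(W[(r - 2) & 15]) + W[(r - 7) & 15] + Gamma0(W[(r - 15) & 15]);
		return W[r & 15];
	}
*/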

#if (defined(__arm__) && defined(__ARM_NEON__))

	// associate variables with registers or memory

	#define	stack_size	(16*8)

	#define	ctx		r0
	#define	num_blocks	r1
	#define	data		r2

	/* use d0-d7 (q0-q3) for 8 digests */
	#define	a	d0
	#define	b	d1
	#define	c	d2
	#define	d	d3
	#define	e	d4
	#define	f	d5
	#define	g	d6
	#define	h	d7

	#define	K	r3

	// 3 local variables
	#define	s	d8
	#define	t	d9
	#define	u	d10

	// a window (16 quad-words) of message schedule
	#define	W0	q8
	#define	W1	q9
	#define	W2	q10
	#define	W3	q11
	#define	W4	q12
	#define	W5	q13
	#define	W6	q14
	#define	W7	q15

	// circular buffer for WK[(r:r+15)%16]
	#define WK(x)   [sp,#((x)&15)*8]

// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))

	/* t = Ch($0, $1, $2) */
	.macro Ch
	veor	t, $1, $2
	vand	t, t, $0
	veor	t, t, $2
	.endm

// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	/* t = Maj($0, $1, $2) */
	.macro	Maj
	veor	t, $1, $2	// y^z
	vand	s, $1,$2	// y&z
	vand	t, t, $0	// x&(y^z)
	veor	t, t, s	// Maj(x,y,z)
	.endm

// #define Gamma0(x)   (S64(1,  (x)) ^ S64(8, (x)) ^ R(7 ,   (x)))

	// performs Gamma0_512 on 2 words on vector registers
	// use q6/q7 as intermediate registers
	.macro	Gamma0
	vshr.u64	q6, $0, #1	// part of S64(1, x)
	vshl.i64	q7, $0, #56	// part of S64(8, x)
	vshr.u64	$0, $0, #7	// R(7, x)
	veor	$0, $0, q6
	vshr.u64	q6, q6, #7	// part of S64(8, x)
	veor	$0, $0, q7
	vshl.i64	q7, q7, #7	// part of S64(1, x)
	veor	$0, $0, q6
	veor	$0, $0, q7
	.endm

// #define Gamma1(x)   (S64(19, (x)) ^ S64(61, (x)) ^ R(6,   (x)))

	// performs Gamma1_512 on 2 words on vector registers
	// use q6/q7 as intermediate registers
	.macro	Gamma1
	vshr.u64	q6, $0, #19	// part of S64(19, x)
	vshl.i64	q7, $0, #3	// part of S64(61, x)
	vshr.u64	$0, $0, #6	// R(6, x)
	veor	$0, $0, q6
	vshr.u64	q6, q6, #42	// part of S64(61, x)
	veor	$0, $0, q7
	vshl.i64	q7, q7, #42	// part of S64(19, x)
	veor	$0, $0, q6
	veor	$0, $0, q7
	.endm

// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
	W0 W1 W2 W3 W4 W5 W6 W7

	update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
	use q5-q7 for temp
*/
	.macro	message_update2
	vext.64	q7, $4, $5, #1		// W[r-7]
	vext.64	q5, $0, $1, #1		// W[r-15]
	vadd.s64	$0, $0, q7	// W[r-16] + W[r-7];
	Gamma0	q5
	vadd.s64	$0, $0, q5	// W[r-16] + W[r-7] + Gamma0(W[r-15])
	vshr.u64	q6, $7, #19	// Gamma1(W[r-2]), part of S64(19, x)
	vshl.i64	q7, $7, #3	// part of S64(61, x)
	vshr.u64	q5, $7, #6	// R(6, x)
	veor	q5, q5, q6
	vshr.u64	q6, q6, #42	// part of S64(61, x)
	veor	q5, q5, q7
	vshl.i64	q7, q7, #42	// part of S64(19, x)
	veor	q5, q5, q6
	veor	q5, q5, q7
	vadd.s64	$0, $0, q5	// W[r-16] + W[r-7] + Gamma0(W[r-15]) + Gamma1(W[r-2])
	.endm

// #define Sigma0(x)   (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))

	.macro	Sigma0
	vshr.u64	t, $0, #28
	vshl.i64	s, $0, #25
	vshr.u64	u, t, #6
	veor	t, t, s
	vshl.i64	s, s, #5
	veor	t, t, u
	vshr.u64	u, u, #5
	veor	t, t, s
	vshl.i64	s, s, #6
	veor	t, t, u
	veor	t, t, s
	.endm

// #define Sigma1(x)   (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))

	.macro	Sigma1
	vshr.u64	t, $0, #14
	vshl.i64	s, $0, #23
	vshr.u64	u, t, #4
	veor	t, t, s
	vshl.i64	s, s, #23
	veor	t, t, u
	vshr.u64	u, u, #23
	veor	t, t, s
	vshl.i64	s, s, #4
	veor	t, t, u
	veor	t, t, s
	.endm

	// per round digests update
	.macro	round_ref
	Sigma1	$4			// t = Sigma1(e);
	vadd.s64	$7, $7, t	// h = h+Sigma1(e)
	Ch	$4, $5, $6		// t = Ch (e, f, g);
	vldr	s, WK($8)		// s = WK
	vadd.s64	$7, $7, t	// h = h+Sigma1(e)+Ch(e,f,g);
	vadd.s64	$7, $7, s	// h = h+Sigma1(e)+Ch(e,f,g)+WK
	vadd.s64	$3, $3, $7	// d += h;
	Sigma0	$0			// t = Sigma0(a);
	vadd.s64	$7, $7, t	// h += Sigma0(a);
	Maj	$0, $1, $2		// t = Maj(a,b,c)
	vadd.s64	$7, $7, t	// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

	.macro	round
	Sigma1	$4			// t = Sigma1(e);
	vldr	s, WK($8)		// s = WK
	vadd.s64	$7, $7, t	// h = h+Sigma1(e)
	veor	t, $5, $6
	vadd.s64	$7, $7, s	// h = h+Sigma1(e)+WK
	vand	t, t, $4
	veor	t, t, $6		// t = Ch (e, f, g);
	vadd.s64	$7, $7, t	// h = h+Sigma1(e)+WK+Ch(e,f,g);
	Sigma0	$0			// t = Sigma0(a);
	vadd.s64	$3, $3, $7	// d += h;
	vadd.s64	$7, $7, t	// h += Sigma0(a);
	Maj	$0, $1, $2		// t = Maj(a,b,c)
	vadd.s64	$7, $7, t	// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

/*
	16 rounds of hash update, update input schedule W (in vector registers W0-W7) and WK = W + K (in stack)
*/
	.macro	rounds_schedule
	mov	r12, sp

	message_update2	W0, W1, W2, W3, W4, W5, W6, W7
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W0
	vst1.64	{q7}, [r12]!

	message_update2	W1, W2, W3, W4, W5, W6, W7, W0
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W1
	vst1.64	{q7}, [r12]!

	message_update2	W2, W3, W4, W5, W6, W7, W0, W1
	round	$4, $5, $6, $7, $0, $1, $2, $3, 4+$8
	round	$3, $4, $5, $6, $7, $0, $1, $2, 5+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W2
	vst1.64	{q7}, [r12]!

	message_update2	W3, W4, W5, W6, W7, W0, W1, W2
	round	$2, $3, $4, $5, $6, $7, $0, $1, 6+$8
	round	$1, $2, $3, $4, $5, $6, $7, $0, 7+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W3
	vst1.64	{q7}, [r12]!

	message_update2	W4, W5, W6, W7, W0, W1, W2, W3
	round	$0, $1, $2, $3, $4, $5, $6, $7, 8+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 9+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W4
	vst1.64	{q7}, [r12]!

	message_update2	W5, W6, W7, W0, W1, W2, W3, W4
	round	$6, $7, $0, $1, $2, $3, $4, $5, 10+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 11+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W5
	vst1.64	{q7}, [r12]!

	message_update2	W6, W7, W0, W1, W2, W3, W4, W5
	round	$4, $5, $6, $7, $0, $1, $2, $3, 12+$8
	round	$3, $4, $5, $6, $7, $0, $1, $2, 13+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W6
	vst1.64	{q7}, [r12]!

	message_update2	W7, W0, W1, W2, W3, W4, W5, W6
	round	$2, $3, $4, $5, $6, $7, $0, $1, 14+$8
	round	$1, $2, $3, $4, $5, $6, $7, $0, 15+$8

	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W7
	vst1.64	{q7}, [r12]!

	.endm

	.macro	rev64
	vrev64.8	$0, $0
	.endm
/*
	16 rounds of hash update, load new input schedule W (in vector registers W0-W7) and update WK = W + K (in stack)
*/
	.macro	rounds_schedule_initial
	mov	r12, sp
	vld1.8	{W0}, [data]!
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	rev64	W0
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W0
	vst1.64	{q7}, [r12]!

	vld1.8	{W1}, [data]!
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	rev64	W1
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W1
	vst1.64	{q7}, [r12]!

	vld1.8	{W2}, [data]!
	round	$4, $5, $6, $7, $0, $1, $2, $3, 4+$8
	rev64	W2
	round	$3, $4, $5, $6, $7, $0, $1, $2, 5+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W2
	vst1.64	{q7}, [r12]!

	vld1.8	{W3}, [data]!
	round	$2, $3, $4, $5, $6, $7, $0, $1, 6+$8
	rev64	W3
	round	$1, $2, $3, $4, $5, $6, $7, $0, 7+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W3
	vst1.64	{q7}, [r12]!

	vld1.8	{W4}, [data]!
	round	$0, $1, $2, $3, $4, $5, $6, $7, 8+$8
	rev64	W4
	round	$7, $0, $1, $2, $3, $4, $5, $6, 9+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W4
	vst1.64	{q7}, [r12]!

	vld1.8	{W5}, [data]!
	round	$6, $7, $0, $1, $2, $3, $4, $5, 10+$8
	rev64	W5
	round	$5, $6, $7, $0, $1, $2, $3, $4, 11+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W5
	vst1.64	{q7}, [r12]!

	vld1.8	{W6}, [data]!
	round	$4, $5, $6, $7, $0, $1, $2, $3, 12+$8
	rev64	W6
	round	$3, $4, $5, $6, $7, $0, $1, $2, 13+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W6
	vst1.64	{q7}, [r12]!

	vld1.8	{W7}, [data]!
	round	$2, $3, $4, $5, $6, $7, $0, $1, 14+$8
	rev64	W7
	round	$1, $2, $3, $4, $5, $6, $7, $0, 15+$8
	vld1.64	{q7}, [K,:128]!
	vadd.s64	q7, q7, W7
	vst1.64	{q7}, [r12]!

	.endm

/*
	16 rounds of hash update
*/
	.macro	rounds_schedule_final
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8

	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8

	round	$4, $5, $6, $7, $0, $1, $2, $3, 4+$8
	round	$3, $4, $5, $6, $7, $0, $1, $2, 5+$8

	round	$2, $3, $4, $5, $6, $7, $0, $1, 6+$8
	round	$1, $2, $3, $4, $5, $6, $7, $0, 7+$8

	round	$0, $1, $2, $3, $4, $5, $6, $7, 8+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 9+$8

	round	$6, $7, $0, $1, $2, $3, $4, $5, 10+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 11+$8

	round	$4, $5, $6, $7, $0, $1, $2, $3, 12+$8
	round	$3, $4, $5, $6, $7, $0, $1, $2, 13+$8

	round	$2, $3, $4, $5, $6, $7, $0, $1, 14+$8
	round	$1, $2, $3, $4, $5, $6, $7, $0, 15+$8
	.endm

	.p2align	4
L_table1:
	.long	L_Tab$non_lazy_ptr-(L_table0+8)

	.p2align	4
	.text
	.globl	_AccelerateCrypto_SHA512_compress
_AccelerateCrypto_SHA512_compress:

	// push callee-saved registers
	push	{r4,r5,r7,lr}
	add	r7, sp, #8		// set up dtrace frame pointer

	vpush	{q4-q7}
#if BUILDKERNEL
	vpush	{q0-q3}
	vpush	{q8-q15}
#endif

	// allocate stack space for WK[0:15]
	sub	sp, sp, #stack_size

	ldr	K, L_table1
L_table0:
	mov	r12, pc
	ldr	K, [r12, K]

	vld1.8	{W0,W1}, [data]!
	vld1.8	{W2,W3}, [data]!
	vld1.8	{W4,W5}, [data]!
	vld1.8	{W6,W7}, [data]!

	rev64	W0
	rev64	W1
	rev64	W2
	rev64	W3
	rev64	W4
	rev64	W5
	rev64	W6
	rev64	W7

	mov	r12, sp
	// compute WK[0:15] and save in stack, use q0-q7 as they have not yet been used
	vld1.8	{q0,q1}, [K,:128]!
	vld1.8	{q2,q3}, [K,:128]!
	vld1.8	{q4,q5}, [K,:128]!
	vld1.8	{q6,q7}, [K,:128]!

	vadd.s64	q0, q0, W0
	vadd.s64	q1, q1, W1
	vadd.s64	q2, q2, W2
	vadd.s64	q3, q3, W3
	vadd.s64	q4, q4, W4
	vadd.s64	q5, q5, W5
	vadd.s64	q6, q6, W6
	vadd.s64	q7, q7, W7

	vst1.32	{q0,q1}, [r12]!
	vst1.32	{q2,q3}, [r12]!
	vst1.32	{q4,q5}, [r12]!
	vst1.32	{q6,q7}, [r12]!

L_loop:

	// digests a-h = ctx->states;
	mov	r12, ctx
	vld1.64	{q0,q1}, [r12]!
	vld1.64	{q2,q3}, [r12]

	// rounds 0:63 interleaved with W/WK update for rounds 16:79
	mov	r4, #4
L_i_loop:
	rounds_schedule	a, b, c, d, e, f, g, h, 16
	subs	r4, r4, #1
	bgt	L_i_loop

	// revert K to the beginning of sha512_K[]
	sub	K, K, #640
	subs	num_blocks, num_blocks, #1	// num_blocks--

	beq	L_final_block			// if final block, wrap up final rounds

	rounds_schedule_initial	a, b, c, d, e, f, g, h, 0

	// ctx->states += digests a-h
	mov	r12, ctx
	vld1.64	{q4,q5}, [r12]!
	vld1.64	{q6,q7}, [r12]
	vadd.s64	q4, q0, q4
	vadd.s64	q5, q1, q5
	vadd.s64	q6, q2, q6
	vadd.s64	q7, q3, q7
	vst1.64	{q4,q5}, [ctx]
	vst1.64	{q6,q7}, [r12]

	bal	L_loop				// branch for next block

	// wrap up digest update rounds 64:79 for final block
L_final_block:
	rounds_schedule_final	a, b, c, d, e, f, g, h, 0

	// ctx->states += digests a-h
	mov	r12, ctx
	vld1.64	{q4,q5}, [r12]!
	vld1.64	{q6,q7}, [r12]
	vadd.s64	q4, q0, q4
	vadd.s64	q5, q1, q5
	vadd.s64	q6, q2, q6
	vadd.s64	q7, q3, q7
	vst1.64	{q4,q5}, [ctx]
	vst1.64	{q6,q7}, [r12]

	// free allocated stack memory
	add	sp, sp, #stack_size

	// if kernel, restore used vector registers
#if BUILDKERNEL
	vpop	{q8-q15}
	vpop	{q0-q3}
#endif
	vpop	{q4-q7}

	// return
	pop	{r4,r5,r7,pc}


	.section	__DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
	.p2align	4
L_Tab$non_lazy_ptr:
	.indirect_symbol	_sha512_K
	.long	0


#endif	// (defined(__arm__) && defined(__ARM_NEON__))

@ -0,0 +1,622 @@
# Copyright (c) (2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

/*
	This file provides an arm64 hand implementation of the following function

	void sha512_compress(uint64_t *state, size_t nblocks, const void *in);

	sha512 algorithm per block description:

		1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
		2. load 8 digests (each 64bit) a-h from state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:79
				W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in v0-v7
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) are stored in GPRs x4-x11 (per the defines below)

	----------------------------------------------------------------------------

	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 8 bytes) into v0:v7
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<64;r+=2) {
		digests a-h update and permute round r:r+1
		update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=64;r<80;r+=2) {
		digests a-h update and permute round r:r+1
		load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
		pre_calculate and store W+K([r:r+1]%16) in stack
	}

	ctx->states += digests a-h;

	jmp L_loop;

L_last_block:

	for (r=64;r<80;r+=2) {
		digests a-h update and permute round r:r+1
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
*/
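/*
 Hedged C model of the big-Sigma functions (illustrative): the arm64 code below
 computes each with one ror plus two eor-with-rotated-operand instructions.

	#include <stdint.h>

	static uint64_t ror64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

	static uint64_t Sigma0(uint64_t x) { return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); }
	static uint64_t Sigma1(uint64_t x) { return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); }
*/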

#if defined __arm64__

#include "ccarm_pac_bti_macros.h"
	// associate variables with registers or memory

	#define	stack_size	(16*8)

	#define	ctx		x0
	#define	num_blocks	x1
	#define	data		x2

	#define	a	x4
	#define	bb	x5
	#define	c	x6
	#define	d	x7
	#define	e	x8
	#define	f	x9
	#define	g	x10
	#define	h	x11

	#define	K	x3

	// 3 local variables
	#define	s	x12
	#define	t	x13
	#define	u	x14

	// a window (16 quad-words) of message schedule
	#define	W0	v0
	#define	W1	v1
	#define	W2	v2
	#define	W3	v3
	#define	W4	v4
	#define	W5	v5
	#define	W6	v6
	#define	W7	v7

	// circular buffer for WK[(r:r+15)%16]
	#define WK(x)   [sp,#((x)&15)*8]

// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))

	/* t = Ch($0, $1, $2) */
	.macro Ch
	eor	t, $1, $2
	and	t, t, $0
	eor	t, t, $2
	.endm

// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	/* t = Maj($0, $1, $2) */
	.macro	Maj
	eor	t, $1, $2	// y^z
	and	s, $1,$2	// y&z
	and	t, t, $0	// x&(y^z)
	eor	t, t, s	// Maj(x,y,z)
	.endm

// #define Gamma0(x)   (S64(1,  (x)) ^ S64(8, (x)) ^ R(7 ,   (x)))

	// performs Gamma0_512 on 2 words on vector registers
	// use v20/v21 as intermediate registers
	.macro	Gamma0
	ushr.2d	v20, $0, #1	// part of S64(1, x)
	shl.2d	v21, $0, #56	// part of S64(8, x)
	ushr.2d	$0, $0, #7	// R(7, x)
	eor.16b	$0, $0, v20
	ushr.2d	v20, v20, #7	// part of S64(8, x)
	eor.16b	$0, $0, v21
	shl.2d	v21,v21, #7	// part of S64(1, x)
	eor.16b	$0, $0, v20
	eor.16b	$0, $0, v21
	.endm

// #define Gamma1(x)   (S64(19, (x)) ^ S64(61, (x)) ^ R(6,   (x)))

	// performs Gamma1_512 on 2 words on vector registers
	// use v16/v17 as intermediate registers
	.macro	Gamma1
	ushr.2d	v16, $0, #19	// part of S64(19, x)
	shl.2d	v17, $0, #3	// part of S64(61, x)
	ushr.2d	$0, $0, #6	// R(6, x)
	eor.16b	$0, $0, v16
	ushr.2d	v16, v16, #42	// part of S64(61, x)
	eor.16b	$0, $0, v17
	shl.2d	v17,v17, #42	// part of S64(19, x)
	eor.16b	$0, $0, v16
	eor.16b	$0, $0, v17
	.endm

// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
	W0 W1 W2 W3 W4 W5 W6 W7

	update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
	use v16-v21 for temp
*/
	.macro	message_update2 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7
	ext.16b	v18, \vec4, \vec5, #8	// vext(W4,W5)
	ext.16b	v19, \vec0, \vec1, #8	// vext(W0,W1)

	ushr.2d	v16, \vec7, #19		// part of S64(19, x)
	shl.2d	v17, \vec7, #3		// part of S64(61, x)
	add.2d	\vec0, \vec0, v18	// W0 + vext(W4,W5)
	ushr.2d	v18, \vec7, #6		// R(6,x)
	ushr.2d	v20, v19, #1		// part of S64(1, x)
	shl.2d	v21, v19, #56		// part of S64(8, x)
	ushr.2d	v19, v19, #7		// R(7, x)

	eor.16b	v18, v18, v16
	ushr.2d	v16, v16, #42		// part of S64(61, x)
	eor.16b	v19, v19, v20
	ushr.2d	v20, v20, #7		// part of S64(8, x)

	eor.16b	v18, v18, v17
	shl.2d	v17, v17, #42		// part of S64(19, x)
	eor.16b	v19, v19, v21
	shl.2d	v21,v21, #7		// part of S64(1, x)
	eor.16b	v18, v18, v16
	eor.16b	v19, v19, v20

	eor.16b	v18, v18, v17
	eor.16b	v19, v19, v21

	add.2d	\vec0, \vec0, v18	// W0 + Gamma1(W7) + vext(W4,W5)
	add.2d	\vec0, \vec0, v19	// W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
	.endm

// #define Sigma0(x)   (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))

	.macro	Sigma0
	ror	t, $0, #28
	eor	t, t, $0, ror #34
	eor	t, t, $0, ror #39
	.endm

// #define Sigma1(x)   (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))

	.macro	Sigma1
	ror	t, $0, #14
	eor	t, t, $0, ror #18
	eor	t, t, $0, ror #41
	.endm

	// per round digests update
	.macro	round_ref
	Sigma1	$4			// t = Sigma1(e);
	add	$7, $7, t		// h = h+Sigma1(e)
	Ch	$4, $5, $6		// t = Ch (e, f, g);
	ldr	s, WK($8)		// s = WK
	add	$7, $7, t		// h = h+Sigma1(e)+Ch(e,f,g);
	add	$7, $7, s		// h = h+Sigma1(e)+Ch(e,f,g)+WK
	add	$3, $3, $7		// d += h;
	Sigma0	$0			// t = Sigma0(a);
	add	$7, $7, t		// h += Sigma0(a);
	Maj	$0, $1, $2		// t = Maj(a,b,c)
	add	$7, $7, t		// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

	.macro	round s0, s1, s2, s3, s4, s5, s6, s7, s8
	ror	t, \s4, #14
	eor	s, \s5, \s6
	ldr	u, WK(\s8)		// u = WK
	eor	t, t, \s4, ror #18
	and	s, s, \s4
	add	\s7, \s7, u		// h = h+WK
	eor	t, t, \s4, ror #41
	eor	s, s, \s6
	add	\s7, \s7, t		// h = h+WK+Sigma1(e)
	eor	t, \s1, \s2		// y^z
	add	\s7, \s7, s		// h = h+WK+Sigma1(e)+Ch(e,f,g);
	ror	s, \s0, #28
	add	\s3, \s3, \s7		// d += h;
	and	u, \s1,\s2		// y&z
	eor	s, s, \s0, ror #34
	and	t, t, \s0		// x&(y^z)
	eor	s, s, \s0, ror #39
	eor	t, t, u			// Maj(x,y,z)
	add	\s7, \s7, s		// h += Sigma0(a);
	add	\s7, \s7, t		// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

	.macro	combined_message_round_update2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7

	//
	//	message_update2	\vec0, \vec1, \vec2, \vec3, \vec4, \vec5, \vec6, \vec7
	//	round	\s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7, 0+\s8+\s9
	//	round	\s7, \s0, \s1, \s2, \s3, \s4, \s5, \s6, 1+\s8+\s9

	ror	t, \s4, #14
	ldr	u, WK(0+\s8+\s9)	// u = WK
	eor	s, \s5, \s6
	ext.16b	v18, \vec4, \vec5, #8	// vext(W4,W5)
	eor	t, t, \s4, ror #18
	and	s, s, \s4
	ext.16b	v19, \vec0, \vec1, #8	// vext(W0,W1)

	add	\s7, \s7, u		// h = h+WK
	eor	t, t, \s4, ror #41
	ushr.2d	v16, \vec7, #19		// part of S64(19, x)
	eor	s, s, \s6
	add	\s7, \s7, t		// h = h+WK+Sigma1(e)
	shl.2d	v17, \vec7, #3		// part of S64(61, x)
	eor	t, \s1, \s2		// y^z
	add.2d	\vec0, \vec0, v18	// W0 + vext(W4,W5)
	ushr.2d	v18, \vec7, #6		// R(6,x)
	add	\s7, \s7, s		// h = h+WK+Sigma1(e)+Ch(e,f,g);
	ushr.2d	v20, v19, #1		// part of S64(1, x)
	ror	s, \s0, #28
	shl.2d	v21, v19, #56		// part of S64(8, x)
	add	\s3, \s3, \s7		// d += h;
	ushr.2d	v19, v19, #7		// R(7, x)
	and	u, \s1,\s2		// y&z

	eor.16b	v18, v18, v16
	eor	s, s, \s0, ror #34
	ushr.2d	v16, v16, #42		// part of S64(61, x)
	and	t, t, \s0		// x&(y^z)
	eor.16b	v19, v19, v20
	eor	s, s, \s0, ror #39
	ushr.2d	v20, v20, #7		// part of S64(8, x)
	eor	t, t, u			// Maj(x,y,z)

	eor.16b	v18, v18, v17
	add	\s7, \s7, s		// h += Sigma0(a);
	shl.2d	v17, v17, #42		// part of S64(19, x)
	add	\s7, \s7, t		// h = T1 + Sigma0(a) + Maj(a,b,c);
	eor.16b	v19, v19, v21
	ror	t, \s3, #14
	shl.2d	v21,v21, #7		// part of S64(1, x)
	ldr	u, WK(1+\s8+\s9)	// u = WK
	eor	s, \s4, \s5
	eor.16b	v18, v18, v16
	ldr	q16, [K]
	eor	t, t, \s3, ror #18
	eor.16b	v19, v19, v20
	add	K, K, #16

	eor.16b	v18, v18, v17
	and	s, s, \s3
	eor.16b	v19, v19, v21
	add	\s6, \s6, u		// h = h+WK

	add.2d	\vec0, \vec0, v18	// W0 + Gamma1(W7) + vext(W4,W5)
	eor	t, t, \s3, ror #41
	add.2d	\vec0, \vec0, v19	// W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
	eor	s, s, \s5
	add	\s6, \s6, t		// h = h+WK+Sigma1(e)
	eor	t, \s0, \s1		// y^z
	add.2d	v16, v16, \vec0
	add	\s6, \s6, s		// h = h+WK+Sigma1(e)+Ch(e,f,g);
	ror	s, \s7, #28
	add	\s2, \s2, \s6		// d += h;
	and	u, \s0,\s1		// y&z
	eor	s, s, \s7, ror #34
	and	t, t, \s7		// x&(y^z)
	eor	s, s, \s7, ror #39
	eor	t, t, u			// Maj(x,y,z)
	add	\s6, \s6, s		// h += Sigma0(a);
	add	\s6, \s6, t		// h = T1 + Sigma0(a) + Maj(a,b,c);

	str	q16, WK(\s9)
	.endm

/*
	16 rounds of hash update, update input schedule W (in vector registers v0-v7) and WK = W + K (in stack)
*/
	.macro	rounds_schedule

	combined_message_round_update2	$0, $1, $2, $3, $4, $5, $6, $7, $8, 0, W0, W1, W2, W3, W4, W5, W6, W7
	combined_message_round_update2	$6, $7, $0, $1, $2, $3, $4, $5, $8, 2, W1, W2, W3, W4, W5, W6, W7, W0
	combined_message_round_update2	$4, $5, $6, $7, $0, $1, $2, $3, $8, 4, W2, W3, W4, W5, W6, W7, W0, W1
	combined_message_round_update2	$2, $3, $4, $5, $6, $7, $0, $1, $8, 6, W3, W4, W5, W6, W7, W0, W1, W2
	combined_message_round_update2	$0, $1, $2, $3, $4, $5, $6, $7, $8, 8, W4, W5, W6, W7, W0, W1, W2, W3
	combined_message_round_update2	$6, $7, $0, $1, $2, $3, $4, $5, $8,10, W5, W6, W7, W0, W1, W2, W3, W4
	combined_message_round_update2	$4, $5, $6, $7, $0, $1, $2, $3, $8,12, W6, W7, W0, W1, W2, W3, W4, W5
	combined_message_round_update2	$2, $3, $4, $5, $6, $7, $0, $1, $8,14, W7, W0, W1, W2, W3, W4, W5, W6

	.endm

/*
	16 rounds of hash update, load new input schedule W (in vector registers v0-v7) and update WK = W + K (in stack)
*/
	.macro	combined_initial_round_update2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, vec0

	ror	t, \s4, #14
	ldr	u, WK(0+\s8+\s9)	// u = WK
	eor	s, \s5, \s6
	ld1.16b	{\vec0}, [data], #16
	eor	t, t, \s4, ror #18
	and	s, s, \s4
	add	\s7, \s7, u		// h = h+WK
	eor	t, t, \s4, ror #41
	eor	s, s, \s6
	add	\s7, \s7, t		// h = h+WK+Sigma1(e)
	eor	t, \s1, \s2		// y^z
	add	\s7, \s7, s		// h = h+WK+Sigma1(e)+Ch(e,f,g);
	ror	s, \s0, #28
	ld1.2d	{v16}, [K], #16

	add	\s3, \s3, \s7		// d += h;
	and	u, \s1,\s2		// y&z
	eor	s, s, \s0, ror #34
	and	t, t, \s0		// x&(y^z)

	eor	s, s, \s0, ror #39
	eor	t, t, u			// Maj(x,y,z)
	add	\s7, \s7, s		// h += Sigma0(a);
	add	\s7, \s7, t		// h = T1 + Sigma0(a) + Maj(a,b,c);
	ror	t, \s3, #14
	eor	s, \s4, \s5
	ldr	u, WK(1+\s8+\s9)	// u = WK

	eor	t, t, \s3, ror #18
	and	s, s, \s3

	add	\s6, \s6, u		// h = h+WK

	rev64.16b	\vec0, \vec0

	eor	t, t, \s3, ror #41
	eor	s, s, \s5
	add	\s6, \s6, t		// h = h+WK+Sigma1(e)
	eor	t, \s0, \s1		// y^z
	add	\s6, \s6, s		// h = h+WK+Sigma1(e)+Ch(e,f,g);
	ror	s, \s7, #28
	add.2d	v16, v16, \vec0
	add	\s2, \s2, \s6		// d += h;
	and	u, \s0,\s1		// y&z
	eor	s, s, \s7, ror #34
	and	t, t, \s7		// x&(y^z)
	eor	s, s, \s7, ror #39
	eor	t, t, u			// Maj(x,y,z)
	add	\s6, \s6, s		// h += Sigma0(a);
	str	q16, WK(\s9)
	add	\s6, \s6, t		// h = T1 + Sigma0(a) + Maj(a,b,c);

	.endm

	.macro	rounds_schedule_initial

	combined_initial_round_update2	$0, $1, $2, $3, $4, $5, $6, $7, $8, 0, W0
	combined_initial_round_update2	$6, $7, $0, $1, $2, $3, $4, $5, $8, 2, W1
	combined_initial_round_update2	$4, $5, $6, $7, $0, $1, $2, $3, $8, 4, W2
	combined_initial_round_update2	$2, $3, $4, $5, $6, $7, $0, $1, $8, 6, W3
	combined_initial_round_update2	$0, $1, $2, $3, $4, $5, $6, $7, $8, 8, W4
	combined_initial_round_update2	$6, $7, $0, $1, $2, $3, $4, $5, $8,10, W5
	combined_initial_round_update2	$4, $5, $6, $7, $0, $1, $2, $3, $8,12, W6
	combined_initial_round_update2	$2, $3, $4, $5, $6, $7, $0, $1, $8,14, W7

	.endm

/*
	16 rounds of hash update
*/
	.macro	rounds_schedule_final
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8

	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8

	round	$4, $5, $6, $7, $0, $1, $2, $3, 4+$8
	round	$3, $4, $5, $6, $7, $0, $1, $2, 5+$8

	round	$2, $3, $4, $5, $6, $7, $0, $1, 6+$8
	round	$1, $2, $3, $4, $5, $6, $7, $0, 7+$8

	round	$0, $1, $2, $3, $4, $5, $6, $7, 8+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 9+$8

	round	$6, $7, $0, $1, $2, $3, $4, $5, 10+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 11+$8

	round	$4, $5, $6, $7, $0, $1, $2, $3, 12+$8
	round	$3, $4, $5, $6, $7, $0, $1, $2, 13+$8

	round	$2, $3, $4, $5, $6, $7, $0, $1, 14+$8
	round	$1, $2, $3, $4, $5, $6, $7, $0, 15+$8
	.endm

	.subsections_via_symbols
	.text
	.p2align	4
	.globl	_AccelerateCrypto_SHA512_compress
_AccelerateCrypto_SHA512_compress:
	BRANCH_TARGET_CALL

#ifdef	__ILP32__
	uxtw	num_blocks, num_blocks		// in arm64_32 size_t is 32-bit, so we need to extend it
#endif


	adrp	K, _sha512_K@page
	cbnz	num_blocks, 1f			// if number of blocks is nonzero, go on for sha512 transform operation
	ret	lr				// otherwise, return
1:
	add	K, K, _sha512_K@pageoff

#if BUILDKERNEL
	// save v0-v7, v16-v23
	sub	x4, sp, #16*16
	sub	sp, sp, #16*16
	st1.4s	{v0, v1, v2, v3}, [x4], #64
	st1.4s	{v4, v5, v6, v7}, [x4], #64
	st1.4s	{v16, v17, v18, v19}, [x4], #64
	st1.4s	{v20, v21, v22, v23}, [x4], #64
#endif


	// allocate stack space for WK[0:15]
	sub	sp, sp, #stack_size
	ldr	q0, [data], #128
	ldr	q1, [data, #-112]
	ldr	q2, [data, #-96]

	ldr	q3, [data, #-80]
	rev64.16b	v0, v0
	ldr	q4, [data, #-64]
	rev64.16b	v1, v1
	ldr	q5, [data, #-48]
	rev64.16b	v2, v2
	ldr	q6, [data, #-32]
	rev64.16b	v3, v3
	ldr	q7, [data, #-16]
	rev64.16b	v4, v4
	ldr	q16, [K], #64
	rev64.16b	v5, v5
	ldr	q17, [K, #-48]
	rev64.16b	v6, v6
	ldr	q18, [K, #-32]
	rev64.16b	v7, v7
	ldr	q19, [K, #-16]


	// compute WK[0:15] and save in stack
	add.2d	v20, v16, v0
	ldr	q16, [K], #64
	add.2d	v21, v17, v1
	ldr	q17, [K, #-48]
	add.2d	v22, v18, v2
	ldr	q18, [K, #-32]
	add.2d	v23, v19, v3
	ldr	q19, [K, #-16]
	add.2d	v16, v16, v4
	str	q20, [sp]
	add.2d	v17, v17, v5
	str	q21, [sp, #16*1]
	add.2d	v18, v18, v6
	str	q22, [sp, #16*2]
	add.2d	v19, v19, v7
	str	q23, [sp, #16*3]
	str	q16, [sp, #16*4]
	str	q17, [sp, #16*5]
	str	q18, [sp, #16*6]
	str	q19, [sp, #16*7]

L_loop:

	// digests a-h = ctx->states;
	ldp	a, bb, [ctx]
	ldp	c, d, [ctx, #16]
	ldp	e, f, [ctx, #32]
	ldp	g, h, [ctx, #48]

	// rounds 0:63 interleaved with W/WK update for rounds 16:79
	mov	w15, #4
L_i_loop:
	rounds_schedule	a, bb, c, d, e, f, g, h, 16
	subs	w15, w15, #1
	b.gt	L_i_loop

	// revert K to the beginning of sha512_K[]
	sub	K, K, #640
	subs	num_blocks, num_blocks, #1	// num_blocks--

	b.eq	L_final_block			// if final block, wrap up final rounds

	rounds_schedule_initial	a, bb, c, d, e, f, g, h, 0

	// ctx->states += digests a-h
	ldp	s, t, [ctx]
	add	s, s, a
	add	t, t, bb
	stp	s, t, [ctx]
	ldp	s, t, [ctx, #16]
	add	s, s, c
	add	t, t, d
	stp	s, t, [ctx, #16]
	ldp	s, t, [ctx, #32]
	add	s, s, e
	add	t, t, f
	stp	s, t, [ctx, #32]
	ldp	s, t, [ctx, #48]
	add	s, s, g
	add	t, t, h
	stp	s, t, [ctx, #48]

	b	L_loop				// branch for next block

	// wrap up digest update rounds 64:79 for final block
L_final_block:
	rounds_schedule_final	a, bb, c, d, e, f, g, h, 0

	// ctx->states += digests a-h
	ldp	s, t, [ctx]
	add	s, s, a
	add	t, t, bb
	stp	s, t, [ctx]
	ldp	s, t, [ctx, #16]
	add	s, s, c
	add	t, t, d
	stp	s, t, [ctx, #16]
	ldp	s, t, [ctx, #32]
	add	s, s, e
	add	t, t, f
	stp	s, t, [ctx, #32]
	ldp	s, t, [ctx, #48]
	add	s, s, g
	add	t, t, h
	stp	s, t, [ctx, #48]

	// free allocated stack memory
	add	sp, sp, #stack_size

	// if kernel, restore used vector registers
#if BUILDKERNEL
	ld1.4s	{v0, v1, v2, v3}, [sp], #64
	ld1.4s	{v4, v5, v6, v7}, [sp], #64
	ld1.4s	{v16, v17, v18, v19}, [sp], #64
	ld1.4s	{v20, v21, v22, v23}, [sp], #64
#endif

	// return
	ret	lr

#endif	// __arm64__
|
||||
|
|
@ -0,0 +1,259 @@
# Copyright (c) (2016,2018,2019,2020) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
/*
|
||||
This file provides arm64 hand implementation of the following function
|
||||
|
||||
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
|
||||
|
||||
sha512 algorithm per block description:
|
||||
|
||||
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
|
||||
2. load 8 digests (each 64bit) a-h from state
|
||||
3. for r = 0:15
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
4. for r = 16:79
|
||||
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
|
||||
In the assembly implementation:
|
||||
- a circular window of message schedule W(r:r+15) is updated and stored in v0-v7
|
||||
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
|
||||
- the 8 digests (a-h) are kept in vector registers (v8-v11, with v24-v27 as per-block working copies)
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
|
||||
into the last 16 rounds of its previous block:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
load W(0:15) (big-endian per 8 bytes) into v0:v7
|
||||
pre_calculate and store W+K(0:15) in stack
|
||||
|
||||
L_loop:
|
||||
|
||||
load digests a-h from ctx->state;
|
||||
|
||||
for (r=0;r<64;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
|
||||
}
|
||||
|
||||
num_block--;
|
||||
if (num_block==0) jmp L_last_block;
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
|
||||
pre_calculate and store W+K([r:r+1]%16) in stack
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
jmp L_loop;
|
||||
|
||||
L_last_block:
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
Apple CoreOS vector & numerics
|
||||
*/
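
/*
 For reference, a minimal plain-C sketch of the per-round update described
 above (a reading aid, not part of the build; ror64 is an assumed helper,
 the rest follows the FIPS 180-4 definitions):

     static inline uint64_t ror64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

     #define Ch(e,f,g)  (((e) & (f)) ^ (~(e) & (g)))
     #define Maj(a,b,c) (((a) & (b)) ^ ((a) & (c)) ^ ((b) & (c)))
     #define Sigma0(a)  (ror64((a), 28) ^ ror64((a), 34) ^ ror64((a), 39))
     #define Sigma1(e)  (ror64((e), 14) ^ ror64((e), 18) ^ ror64((e), 41))

     T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];   // one round r
     d += T1;
     h  = T1 + Sigma0(a) + Maj(a,b,c);
*/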
|
||||
|
||||
#if defined __arm64__
|
||||
|
||||
#include "ccarm_pac_bti_macros.h"
|
||||
|
||||
.macro swap_hilo
|
||||
ext.16b $0, $0, $0, #8
|
||||
.endm
|
||||
|
||||
.macro ext16b
|
||||
ext.16b $0, $1, $2, #8
|
||||
.endm
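// swap_hilo swaps the two 64-bit halves of a vector ($0 = ext($0,$0,#8));
// ext16b is the general two-source form, equivalent to vext(a, b, #8)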
|
||||
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.globl _AccelerateCrypto_SHA512_compress_hwassist
|
||||
|
||||
_AccelerateCrypto_SHA512_compress_hwassist:
|
||||
|
||||
BRANCH_TARGET_CALL
|
||||
|
||||
|
||||
#define hashes x0
|
||||
#define numblocks x1
|
||||
#define data x2
|
||||
#define ktable x3
|
||||
|
||||
#ifdef __ILP32__
|
||||
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so zero-extend it to 64 bits
|
||||
#endif
|
||||
|
||||
|
||||
adrp ktable, _ccsha512_K@page
|
||||
cbnz numblocks, 1f
|
||||
ret lr // otherwise, return
|
||||
1:
|
||||
add ktable, ktable, _ccsha512_K@pageoff
|
||||
|
||||
#if BUILDKERNEL
|
||||
sub x4, sp, #28*16
|
||||
sub sp, sp, #28*16
|
||||
st1.4s {v0, v1, v2, v3}, [x4], #64
|
||||
st1.4s {v4, v5, v6, v7}, [x4], #64
|
||||
st1.4s {v16, v17, v18, v19}, [x4], #64
|
||||
st1.4s {v20, v21, v22, v23}, [x4], #64
|
||||
st1.4s {v24, v25, v26, v27}, [x4], #64
|
||||
st1.4s {v28, v29, v30, v31}, [x4], #64
|
||||
#else
|
||||
sub x4, sp, #4*16
|
||||
sub sp, sp, #4*16
|
||||
#endif
|
||||
st1.4s {v8, v9, v10, v11}, [x4], #64
|
||||
|
||||
ld1.2d {v8,v9,v10,v11}, [hashes] // (a,b) (c,d) (e,f) (g,h)
|
||||
|
||||
L_loop:
|
||||
|
||||
mov.16b v24, v8
|
||||
ldr q0, [data, #0*16]
|
||||
mov.16b v25, v9
|
||||
ldr q1, [data, #1*16]
|
||||
mov.16b v26, v10
|
||||
ldr q2, [data, #2*16]
|
||||
mov.16b v27, v11
|
||||
ldr q3, [data, #3*16]
|
||||
|
||||
rev64.16b v0, v0
|
||||
ldr q4, [data, #4*16]
|
||||
rev64.16b v1, v1
|
||||
ldr q5, [data, #5*16]
|
||||
rev64.16b v2, v2
|
||||
ldr q6, [data, #6*16]
|
||||
rev64.16b v3, v3
|
||||
ldr q7, [data, #7*16]
|
||||
rev64.16b v4, v4
|
||||
ldr q16, [ktable, #0*16]
|
||||
rev64.16b v5, v5
|
||||
ldr q17, [ktable, #1*16]
|
||||
rev64.16b v6, v6
|
||||
ldr q18, [ktable, #2*16]
|
||||
rev64.16b v7, v7
|
||||
ldr q19, [ktable, #3*16]
|
||||
|
||||
add.2d v16, v16, v0
|
||||
ldr q20, [ktable, #4*16]
|
||||
add.2d v17, v17, v1
|
||||
ldr q21, [ktable, #5*16]
|
||||
add.2d v18, v18, v2
|
||||
ldr q22, [ktable, #6*16]
|
||||
add.2d v19, v19, v3
|
||||
ldr q23, [ktable, #7*16]
|
||||
add.2d v20, v20, v4
|
||||
add data, data, #8*16
|
||||
add.2d v21, v21, v5
|
||||
add ktable, ktable, #8*16
|
||||
add.2d v22, v22, v6
|
||||
add.2d v23, v23, v7
|
||||
|
||||
.macro sha512_round S0, S1, S2, S3, WK, w0, w1, w4, w5, w7, i
|
||||
ext16b \WK, \WK, \WK
|
||||
ext16b v29, \S2, \S3
|
||||
ext16b v28, \S1, \S2
|
||||
add.2d \S3, \S3, \WK
|
||||
ext16b v31, \w4, \w5
|
||||
ldr q30, [ktable, #\i*16]
|
||||
sha512h.2d \S3, v29, v28
|
||||
sha512su0.2d \w0, \w1
|
||||
mov.16b v28, \S3
|
||||
sha512h2.2d \S3, \S1, \S0
|
||||
sha512su1.2d \w0, \w7, v31
|
||||
add.2d \S1, \S1, v28
|
||||
add.2d \WK, \w0, v30
|
||||
.endm
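// sha512h/sha512h2/sha512su0/sha512su1 above are the ARMv8 SHA-512 crypto
// extension instructions: the *h pair folds two rounds of the digest update,
// while the *su pair performs the two-lane message-schedule update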
|
||||
|
||||
.macro sha512_8_rounds
|
||||
sha512_round v24, v25, v26, v27, v16, v0, v1, v4, v5, v7, 0
|
||||
sha512_round v27, v24, v25, v26, v17, v1, v2, v5, v6, v0, 1
|
||||
sha512_round v26, v27, v24, v25, v18, v2, v3, v6, v7, v1, 2
|
||||
sha512_round v25, v26, v27, v24, v19, v3, v4, v7, v0, v2, 3
|
||||
sha512_round v24, v25, v26, v27, v20, v4, v5, v0, v1, v3, 4
|
||||
sha512_round v27, v24, v25, v26, v21, v5, v6, v1, v2, v4, 5
|
||||
sha512_round v26, v27, v24, v25, v22, v6, v7, v2, v3, v5, 6
|
||||
sha512_round v25, v26, v27, v24, v23, v7, v0, v3, v4, v6, 7
|
||||
add ktable, ktable, #16*8
|
||||
.endm
|
||||
|
||||
.macro sha512_round_final S0, S1, S2, S3, WK, w0, w1, w4, w5, w7
|
||||
ext16b \WK, \WK, \WK
|
||||
ext16b v29, \S2, \S3
|
||||
ext16b v28, \S1, \S2
|
||||
add.2d v30, \S3, \WK
|
||||
sha512h.2d v30, v29, v28
|
||||
mov.16b \S3, v30
|
||||
sha512h2.2d \S3, \S1, \S0
|
||||
add.2d \S1, \S1, v30
|
||||
.endm
|
||||
|
||||
.macro sha512_8_rounds_final
|
||||
sha512_round_final v24, v25, v26, v27, v16
|
||||
sha512_round_final v27, v24, v25, v26, v17
|
||||
sha512_round_final v26, v27, v24, v25, v18
|
||||
sha512_round_final v25, v26, v27, v24, v19
|
||||
sha512_round_final v24, v25, v26, v27, v20
|
||||
sha512_round_final v27, v24, v25, v26, v21
|
||||
sha512_round_final v26, v27, v24, v25, v22
|
||||
sha512_round_final v25, v26, v27, v24, v23
|
||||
.endm
|
||||
|
||||
sha512_8_rounds
|
||||
sha512_8_rounds
|
||||
sha512_8_rounds
|
||||
sha512_8_rounds
|
||||
sha512_8_rounds_final
|
||||
|
||||
add.2d v8, v8, v24
|
||||
add.2d v9, v9, v25
|
||||
add.2d v10, v10, v26
|
||||
add.2d v11, v11, v27
|
||||
|
||||
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
|
||||
sub ktable, ktable, #640
|
||||
b.gt L_loop
|
||||
|
||||
st1.2d {v8,v9,v10,v11}, [hashes]
|
||||
|
||||
#if BUILDKERNEL
|
||||
ld1.4s {v0, v1, v2, v3}, [sp], #64
|
||||
ld1.4s {v4, v5, v6, v7}, [sp], #64
|
||||
ld1.4s {v16, v17, v18, v19}, [sp], #64
|
||||
ld1.4s {v20, v21, v22, v23}, [sp], #64
|
||||
ld1.4s {v24, v25, v26, v27}, [sp], #64
|
||||
ld1.4s {v28, v29, v30, v31}, [sp], #64
|
||||
#endif
|
||||
ld1.4s {v8, v9, v10, v11}, [sp], #64
|
||||
|
||||
ret lr
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -0,0 +1,29 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include "config.h"
|
||||
#include "AccelerateCrypto.h"
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
extern void AccelerateCrypto_SHA512_compress_ssse3(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_ssse3");
extern void AccelerateCrypto_SHA512_compress_AVX1(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_AVX1");
extern void AccelerateCrypto_SHA512_compress_AVX2(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_AVX2");
|
||||
|
||||
void AccelerateCrypto_SHA512_compress(uint64_t *state, size_t num, const void *buf)
{
    if (HAS_AVX2()) AccelerateCrypto_SHA512_compress_AVX2(state, num, buf);
    else if (HAS_AVX1()) AccelerateCrypto_SHA512_compress_AVX1(state, num, buf);
    else AccelerateCrypto_SHA512_compress_ssse3(state, num, buf);
}
|
||||
#endif // defined(__x86_64__)
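
/*
 * Minimal usage sketch (illustrative only, not compiled): drive the
 * compression function over whole 128-byte blocks. The initial state is
 * the standard SHA-512 IV from FIPS 180-4; message padding and
 * finalization are assumed to happen elsewhere.
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *
 *     void example(const void *msg, size_t nblocks) // msg: nblocks*128 bytes
 *     {
 *         uint64_t state[8] = {
 *             0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
 *             0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
 *             0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
 *             0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
 *         };
 *         AccelerateCrypto_SHA512_compress(state, nblocks, msg);
 *     }
 */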
|
||||
|
|
@ -0,0 +1,616 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
/*
|
||||
This file provides x86_64 hand implementation of the following function
|
||||
|
||||
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
|
||||
|
||||
sha512 algorithm per block description:
|
||||
|
||||
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
|
||||
2. load 8 digests (each 64bit) a-h from state
|
||||
3. for r = 0:15
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
4. for r = 16:79
|
||||
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
|
||||
In the assembly implementation:
|
||||
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx2/avx512)
|
||||
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
|
||||
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
|
||||
into the last 16 rounds of its previous block:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
|
||||
pre_calculate and store W+K(0:15) in stack
|
||||
|
||||
L_loop:
|
||||
|
||||
load digests a-h from ctx->state;
|
||||
|
||||
for (r=0;r<64;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
|
||||
}
|
||||
|
||||
num_block--;
|
||||
if (num_block==0) jmp L_last_block;
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
|
||||
pre_calculate and store W+K([r:r+1]%16) in stack
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
jmp L_loop;
|
||||
|
||||
L_last_block:
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
Apple CoreOS vector & numerics
|
||||
*/
|
||||
#if defined __x86_64__
|
||||
|
||||
// associate variables with registers or memory
|
||||
|
||||
#define sp %rsp
|
||||
#define ctx %rdi
|
||||
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
|
||||
#define data %rdx
|
||||
|
||||
#define a %r8
|
||||
#define b %r9
|
||||
#define c %r10
|
||||
#define d %r11
|
||||
#define e %r12
|
||||
#define f %r13
|
||||
#define g %r14
|
||||
#define h %r15
|
||||
|
||||
#define K %rbx
|
||||
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define stack_size (8+32*12+128+16) // 8 (_num_blocks) + ymm0:ymm11 + WK(0:15) + 16 bytes for 32-byte alignment

#define L_aligned_bswap L_bswap(%rip) // bswap : big-endian loading of 8-byte words
#define ymm_save 128(sp) // starting address for ymm save/restore
|
||||
|
||||
// 3 local variables
|
||||
#define s %rax
|
||||
#define t %rcx
|
||||
#define u %rsi
|
||||
|
||||
// a window (16 quad-words) of message schedule
|
||||
#define W0 %xmm0
|
||||
#define W1 %xmm1
|
||||
#define W2 %xmm2
|
||||
#define W3 %xmm3
|
||||
#define W4 %xmm4
|
||||
#define W5 %xmm5
|
||||
#define W6 %xmm6
|
||||
#define W7 %xmm7
|
||||
|
||||
// circular buffer for WK[(r:r+15)%16]
|
||||
#define WK(x) ((x)&15)*8(sp)
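// indices wrap modulo 16 and each entry is 8 bytes, e.g. WK(17) aliases WK(1)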
|
||||
|
||||
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
|
||||
|
||||
.macro Ch arg0, arg1, arg2
|
||||
#if 1
|
||||
mov \arg2, t
|
||||
xor \arg1, t
|
||||
and \arg0, t
|
||||
xor \arg2, t
|
||||
#else
|
||||
mov \arg0, t // x
|
||||
mov \arg0, s // x
|
||||
not t // ~x
|
||||
and \arg1, s // x & y
|
||||
and \arg2, t // ~x & z
|
||||
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
|
||||
#endif
|
||||
.endm
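// the #if 1 path uses the identity Ch(x,y,z) = ((y ^ z) & x) ^ z,
// which needs only one temp register and no NOT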
|
||||
|
||||
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
|
||||
.macro Maj arg0, arg1, arg2
|
||||
mov \arg1, t // y
|
||||
mov \arg2, s // z
|
||||
xor \arg2, t // y^z
|
||||
and \arg1, s // y&z
|
||||
and \arg0, t // x&(y^z)
|
||||
xor s, t // Maj(x,y,z)
|
||||
.endm
|
||||
|
||||
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
|
||||
|
||||
// performs Gamma0_512 on 2 quad-words in an xmm register
|
||||
// use xmm8/xmm9 as intermediate registers
|
||||
.macro Gamma0 arg0
|
||||
vpsrlq $1, \arg0, %xmm8 // part of S64(1, x)
|
||||
vpsllq $56, \arg0, %xmm9 // part of S64(8, x)
|
||||
vpsrlq $7, \arg0, \arg0 // R(7, x)
|
||||
vpxor %xmm8, \arg0, \arg0
|
||||
vpsrlq $7, %xmm8, %xmm8 // part of S64(8, x)
|
||||
vpxor %xmm9, \arg0, \arg0
|
||||
vpsllq $7, %xmm9, %xmm9 // part of S64(1, x)
|
||||
vpxor %xmm8, \arg0, \arg0
|
||||
vpxor %xmm9, \arg0, \arg0
|
||||
.endm
|
||||
|
||||
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
|
||||
|
||||
// performs Gamma1_512 on 2 quad-words in an xmm register
|
||||
// use xmm8/xmm9 as intermediate registers
|
||||
.macro Gamma1 arg0
|
||||
vpsrlq $19, \arg0, %xmm8 // part of S64(19, x)
|
||||
vpsllq $3, \arg0, %xmm9 // part of S64(61, x)
|
||||
vpsrlq $6, \arg0, \arg0 // R(6, x)
|
||||
vpxor %xmm8, \arg0, \arg0
|
||||
vpsrlq $42, %xmm8, %xmm8 // part of S64(61, x)
|
||||
vpxor %xmm9, \arg0, \arg0
|
||||
vpsllq $42, %xmm9, %xmm9 // part of S64(19, x)
|
||||
vpxor %xmm8, \arg0, \arg0
|
||||
vpxor %xmm9, \arg0, \arg0
|
||||
.endm
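
/*
 A plain-C cross-check of the two shifted-xor networks above (a sketch;
 ror64 is an assumed rotate-right helper):

     #define Gamma0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7))
     #define Gamma1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))

 xmm registers have no 64-bit rotate, so each rotate is assembled from a
 right-shift/left-shift pair that is xor-ed in separately.
*/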
|
||||
|
||||
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
/*
|
||||
W0 W1 W2 W3 W4 W5 W6 W7
|
||||
|
||||
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
|
||||
use %xmm10, %xmm11 for temp
|
||||
*/
|
||||
.macro message_update2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
vpalignr $8, \arg4, \arg5, %xmm10 // vext(W4,W5)
|
||||
vpalignr $8, \arg0, \arg1, %xmm11 // vext(W0,W1)
|
||||
vpaddq %xmm10, \arg0, \arg0 // W0 + vext(W4,W5)
|
||||
// vmovdqa \arg7, %xmm10
|
||||
// Gamma1 %xmm10 // Gamma1(W7)
|
||||
vpsrlq $19, \arg7, %xmm8 // part of S64(19, x)
|
||||
vpsllq $3, \arg7, %xmm9 // part of S64(61, x)
|
||||
vpsrlq $6, \arg7, %xmm10 // R(6, x)
|
||||
vpxor %xmm8, %xmm10, %xmm10
|
||||
vpsrlq $42, %xmm8, %xmm8 // part of S64(61, x)
|
||||
vpxor %xmm9, %xmm10, %xmm10
|
||||
vpsllq $42, %xmm9, %xmm9 // part of S64(19, x)
|
||||
vpxor %xmm8, %xmm10, %xmm10
|
||||
vpxor %xmm9, %xmm10, %xmm10
|
||||
Gamma0 %xmm11 // Gamma0(vext(W0,W1))
|
||||
vpaddq %xmm10, \arg0, \arg0 // W0 + Gamma1(W7) + vext(W4,W5)
|
||||
vpaddq %xmm11, \arg0, \arg0 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
|
||||
.endm
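
/*
 In scalar terms, one message_update2 advances two schedule lanes (a sketch
 using the 80-entry W[] from the header comment):

     W[r]   = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
     W[r+1] = W[r-15] + Gamma1(W[r-1]) + W[r-6] + Gamma0(W[r-14]);
*/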
|
||||
|
||||
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
|
||||
|
||||
.macro Sigma0 arg0
|
||||
mov \arg0, t // x
|
||||
mov \arg0, s // x
|
||||
ror $28, t // S(28, (x))
|
||||
ror $34, s // S(34, (x))
|
||||
xor s, t // S(28, (x)) ^ S(34, (x))
|
||||
ror $5, s // S(39, (x))
|
||||
xor s, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
|
||||
.endm
|
||||
|
||||
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
|
||||
|
||||
.macro Sigma1 arg0
|
||||
mov \arg0, s // x
|
||||
ror $14, s // S(14, (x))
|
||||
mov s, t // S(14, (x))
|
||||
ror $4, s // S(18, (x))
|
||||
xor s, t // S(14, (x)) ^ S(18, (x))
|
||||
ror $23, s // S(41, (x))
|
||||
xor s, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
|
||||
.endm
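// note: both Sigma macros chain rotates on the same register
// (e.g. ror $34 then ror $5 yields the rotate-by-39 term) to avoid
// reloading the input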
|
||||
|
||||
// per round digests update
|
||||
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
Sigma1 \arg4 // t = Sigma1(e);
|
||||
add t, \arg7 // h = h+Sigma1(e)
|
||||
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
|
||||
add t, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
|
||||
add WK(\arg8), \arg7 // h = h+Sigma1(e)+Ch(e,f,g)+WK
|
||||
add \arg7, \arg3 // d += h;
|
||||
Sigma0 \arg0 // t = Sigma0(a);
|
||||
add t, \arg7 // h += Sigma0(a);
|
||||
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
|
||||
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
|
||||
.endm
|
||||
|
||||
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
mov \arg4, s
|
||||
mov \arg0, t
|
||||
ror $(41-18), s
|
||||
ror $(39-34), t
|
||||
xor \arg4, s
|
||||
mov \arg5, u
|
||||
xor \arg0, t
|
||||
ror $(18-14), s
|
||||
xor \arg6, u
|
||||
xor \arg4, s
|
||||
ror $(34-28), t
|
||||
and \arg4, u
|
||||
xor \arg0, t
|
||||
xor \arg6, u
|
||||
ror $14, s
|
||||
ror $28, t
|
||||
add s, u
|
||||
mov \arg0, s
|
||||
add WK(\arg8), u
|
||||
or \arg2, s
|
||||
add u, \arg7
|
||||
mov \arg0, u
|
||||
add \arg7, \arg3
|
||||
and \arg1, s
|
||||
and \arg2, u
|
||||
or u, s
|
||||
add t, \arg7
|
||||
add s, \arg7
|
||||
.endm
|
||||
|
||||
/*
|
||||
16 rounds of hash update, update input schedule W (in vector register xmm0-xmm7) and WK = W + K (in stack)
|
||||
*/
|
||||
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
|
||||
vmovdqa 0*16(K), %xmm8
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
|
||||
vpaddq W0, %xmm8, %xmm8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
|
||||
vmovdqa %xmm8, WK(0)
|
||||
|
||||
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
|
||||
vmovdqa 1*16(K), %xmm8
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
|
||||
vpaddq W1, %xmm8, %xmm8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
|
||||
vmovdqa %xmm8, WK(2)
|
||||
|
||||
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
|
||||
vmovdqa 2*16(K), %xmm8
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
|
||||
vpaddq W2, %xmm8, %xmm8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
|
||||
vmovdqa %xmm8, WK(4)
|
||||
|
||||
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
|
||||
vmovdqa 3*16(K), %xmm8
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
|
||||
vpaddq W3, %xmm8, %xmm8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
|
||||
vmovdqa %xmm8, WK(6)
|
||||
|
||||
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
|
||||
vmovdqa 4*16(K), %xmm8
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
|
||||
vpaddq W4, %xmm8, %xmm8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
|
||||
vmovdqa %xmm8, WK(8)
|
||||
|
||||
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
|
||||
vmovdqa 5*16(K), %xmm8
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
|
||||
vpaddq W5, %xmm8, %xmm8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
|
||||
vmovdqa %xmm8, WK(10)
|
||||
|
||||
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
|
||||
vmovdqa 6*16(K), %xmm8
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
|
||||
vpaddq W6, %xmm8, %xmm8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
|
||||
vmovdqa %xmm8, WK(12)
|
||||
|
||||
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
|
||||
vmovdqa 7*16(K), %xmm8
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
|
||||
vpaddq W7, %xmm8, %xmm8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
|
||||
vmovdqa %xmm8, WK(14)
|
||||
|
||||
addq $128, K
|
||||
.endm
|
||||
|
||||
/*
|
||||
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
|
||||
*/
|
||||
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
vmovdqu 0*16(data), W0
|
||||
vmovdqa 0*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W0, W0
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
|
||||
vpaddq W0, %xmm8, %xmm8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
|
||||
vmovdqa %xmm8, WK(0)
|
||||
|
||||
vmovdqu 1*16(data), W1
|
||||
vmovdqa 1*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W1, W1
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
|
||||
vpaddq W1, %xmm8, %xmm8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
|
||||
vmovdqa %xmm8, WK(2)
|
||||
|
||||
vmovdqu 2*16(data), W2
|
||||
vmovdqa 2*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W2, W2
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
|
||||
vpaddq W2, %xmm8, %xmm8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
|
||||
vmovdqa %xmm8, WK(4)
|
||||
|
||||
vmovdqu 3*16(data), W3
|
||||
vmovdqa 3*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W3, W3
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
|
||||
vpaddq W3, %xmm8, %xmm8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
|
||||
vmovdqa %xmm8, WK(6)
|
||||
|
||||
vmovdqu 4*16(data), W4
|
||||
vmovdqa 4*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W4, W4
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
|
||||
vpaddq W4, %xmm8, %xmm8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
|
||||
vmovdqa %xmm8, WK(8)
|
||||
|
||||
vmovdqu 5*16(data), W5
|
||||
vmovdqa 5*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W5, W5
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
|
||||
vpaddq W5, %xmm8, %xmm8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
|
||||
vmovdqa %xmm8, WK(10)
|
||||
|
||||
vmovdqu 6*16(data), W6
|
||||
vmovdqa 6*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W6, W6
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
|
||||
vpaddq W6, %xmm8, %xmm8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
|
||||
vmovdqa %xmm8, WK(12)
|
||||
|
||||
vmovdqu 7*16(data), W7
|
||||
vmovdqa 7*16(K), %xmm8
|
||||
vpshufb L_aligned_bswap, W7, W7
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
|
||||
vpaddq W7, %xmm8, %xmm8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
|
||||
vmovdqa %xmm8, WK(14)
|
||||
|
||||
addq $128, K
|
||||
addq $128, data
|
||||
.endm
|
||||
|
||||
/*
|
||||
16 rounds of hash update
|
||||
*/
|
||||
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
|
||||
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
|
||||
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
|
||||
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
|
||||
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
|
||||
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
|
||||
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
|
||||
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
|
||||
.endm
|
||||
|
||||
.text
|
||||
.globl _AccelerateCrypto_SHA512_compress_AVX1
|
||||
_AccelerateCrypto_SHA512_compress_AVX1:
|
||||
|
||||
// push callee-saved registers
|
||||
push %rbp
|
||||
movq %rsp, %rbp
|
||||
push %rbx
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
// allocate stack space
|
||||
sub $stack_size, sp
andq $-32, sp // align sp to 32 bytes
|
||||
|
||||
// if kernel code, save used ymm registers
|
||||
#if BUILDKERNEL
|
||||
vmovdqa %ymm0, 0*32+ymm_save
|
||||
vmovdqa %ymm1, 1*32+ymm_save
|
||||
vmovdqa %ymm2, 2*32+ymm_save
|
||||
vmovdqa %ymm3, 3*32+ymm_save
|
||||
vmovdqa %ymm4, 4*32+ymm_save
|
||||
vmovdqa %ymm5, 5*32+ymm_save
|
||||
vmovdqa %ymm6, 6*32+ymm_save
|
||||
vmovdqa %ymm7, 7*32+ymm_save
|
||||
vmovdqa %ymm8, 8*32+ymm_save
|
||||
vmovdqa %ymm9, 9*32+ymm_save
|
||||
vmovdqa %ymm10, 10*32+ymm_save
|
||||
vmovdqa %ymm11, 11*32+ymm_save
|
||||
#endif
|
||||
|
||||
movq num_blocks, _num_blocks
|
||||
|
||||
// set up pointer to the constants table K512[]
|
||||
lea CC_C_LABEL(sha512_K)(%rip), K
|
||||
|
||||
// load W[0:15] into xmm0-xmm7
|
||||
vmovdqu 0*16(data), W0
|
||||
vmovdqu 1*16(data), W1
|
||||
vmovdqu 2*16(data), W2
|
||||
vmovdqu 3*16(data), W3
|
||||
vmovdqu 4*16(data), W4
|
||||
vmovdqu 5*16(data), W5
|
||||
vmovdqu 6*16(data), W6
|
||||
vmovdqu 7*16(data), W7
|
||||
addq $128, data
|
||||
|
||||
vmovdqa L_aligned_bswap, %xmm8
|
||||
vpshufb %xmm8, W0, W0
|
||||
vpshufb %xmm8, W1, W1
|
||||
vpshufb %xmm8, W2, W2
|
||||
vpshufb %xmm8, W3, W3
|
||||
vpshufb %xmm8, W4, W4
|
||||
vpshufb %xmm8, W5, W5
|
||||
vpshufb %xmm8, W6, W6
|
||||
vpshufb %xmm8, W7, W7
|
||||
|
||||
// compute WK[0:15] and save in stack
|
||||
vpaddq 0*16(K), %xmm0, %xmm8
|
||||
vpaddq 1*16(K), %xmm1, %xmm9
|
||||
vpaddq 2*16(K), %xmm2, %xmm10
|
||||
vpaddq 3*16(K), %xmm3, %xmm11
|
||||
vmovdqa %xmm8, WK(0)
|
||||
vmovdqa %xmm9, WK(2)
|
||||
vmovdqa %xmm10, WK(4)
|
||||
vmovdqa %xmm11, WK(6)
|
||||
|
||||
vpaddq 4*16(K), %xmm4, %xmm8
|
||||
vpaddq 5*16(K), %xmm5, %xmm9
|
||||
vpaddq 6*16(K), %xmm6, %xmm10
|
||||
vpaddq 7*16(K), %xmm7, %xmm11
|
||||
vmovdqa %xmm8, WK(8)
|
||||
vmovdqa %xmm9, WK(10)
|
||||
vmovdqa %xmm10, WK(12)
|
||||
vmovdqa %xmm11, WK(14)
|
||||
addq $128, K
|
||||
|
||||
L_loop:
|
||||
|
||||
// digests a-h = ctx->states;
|
||||
mov 0*8(ctx), a
|
||||
mov 1*8(ctx), b
|
||||
mov 2*8(ctx), c
|
||||
mov 3*8(ctx), d
|
||||
mov 4*8(ctx), e
|
||||
mov 5*8(ctx), f
|
||||
mov 6*8(ctx), g
|
||||
mov 7*8(ctx), h
|
||||
|
||||
// rounds 0:63 interleaved with W/WK update for rounds 16:79
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 16
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 32
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 48
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 64
|
||||
|
||||
// revert K to the beginning of K512[]
|
||||
subq $640, K
|
||||
subq $1, _num_blocks // num_blocks--
|
||||
|
||||
je L_final_block // if final block, wrap up final rounds
|
||||
|
||||
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
|
||||
|
||||
// ctx->states += digests a-h
|
||||
add a, 0*8(ctx)
|
||||
add b, 1*8(ctx)
|
||||
add c, 2*8(ctx)
|
||||
add d, 3*8(ctx)
|
||||
add e, 4*8(ctx)
|
||||
add f, 5*8(ctx)
|
||||
add g, 6*8(ctx)
|
||||
add h, 7*8(ctx)
|
||||
|
||||
jmp L_loop // branch for next block
|
||||
|
||||
// wrap up digest update rounds 64:79 for the final block
|
||||
L_final_block:
|
||||
rounds_schedule_final a, b, c, d, e, f, g, h, 0
|
||||
|
||||
// ctx->states += digests a-h
|
||||
add a, 0*8(ctx)
|
||||
add b, 1*8(ctx)
|
||||
add c, 2*8(ctx)
|
||||
add d, 3*8(ctx)
|
||||
add e, 4*8(ctx)
|
||||
add f, 5*8(ctx)
|
||||
add g, 6*8(ctx)
|
||||
add h, 7*8(ctx)
|
||||
|
||||
// if kernel, restore ymm0-ymm11
|
||||
#if BUILDKERNEL
|
||||
vmovdqa 0*32+ymm_save, %ymm0
|
||||
vmovdqa 1*32+ymm_save, %ymm1
|
||||
vmovdqa 2*32+ymm_save, %ymm2
|
||||
vmovdqa 3*32+ymm_save, %ymm3
|
||||
vmovdqa 4*32+ymm_save, %ymm4
|
||||
vmovdqa 5*32+ymm_save, %ymm5
|
||||
vmovdqa 6*32+ymm_save, %ymm6
|
||||
vmovdqa 7*32+ymm_save, %ymm7
|
||||
vmovdqa 8*32+ymm_save, %ymm8
|
||||
vmovdqa 9*32+ymm_save, %ymm9
|
||||
vmovdqa 10*32+ymm_save, %ymm10
|
||||
vmovdqa 11*32+ymm_save, %ymm11
|
||||
#endif
|
||||
|
||||
// free allocated stack memory
|
||||
leaq -40(%rbp), sp
|
||||
|
||||
// restore callee-saved registers
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
|
||||
// return
|
||||
ret
|
||||
|
||||
// data for using ssse3 pshufb instruction (big-endian loading of data)
|
||||
CC_ASM_SECTION_CONST
|
||||
.p2align 4
|
||||
|
||||
L_bswap:
|
||||
.quad 0x0001020304050607
|
||||
.quad 0x08090a0b0c0d0e0f
|
||||
|
||||
#endif // x86_64
|
||||
|
||||
|
|
@ -0,0 +1,552 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
/*
|
||||
This file provides x86_64 avx2 hand implementation of the following function
|
||||
|
||||
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
|
||||
|
||||
sha512 algorithm per block description:
|
||||
|
||||
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
|
||||
2. load 8 digests (each 64bit) a-h from state
|
||||
3. for r = 0:15
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
4. for r = 16:79
|
||||
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
|
||||
In the assembly implementation:
|
||||
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx2/avx512)
|
||||
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
|
||||
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
|
||||
into the last 16 rounds of its previous block:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
|
||||
pre_calculate and store W+K(0:15) in stack
|
||||
|
||||
L_loop:
|
||||
|
||||
load digests a-h from ctx->state;
|
||||
|
||||
for (r=0;r<64;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
|
||||
}
|
||||
|
||||
num_block--;
|
||||
if (num_block==0) jmp L_last_block;
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
|
||||
pre_calculate and store W+K([r:r+1]%16) in stack
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
jmp L_loop;
|
||||
|
||||
L_last_block:
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
Apple CoreOS vector & numerics
|
||||
*/
|
||||
#if defined __x86_64__
|
||||
|
||||
// associate variables with registers or memory
|
||||
|
||||
#define sp %rsp
|
||||
#define ctx %rdi
|
||||
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
|
||||
#define data %rdx
|
||||
|
||||
#define a %r8
|
||||
#define b %r9
|
||||
#define c %r10
|
||||
#define d %r11
|
||||
#define e %r12
|
||||
#define f %r13
|
||||
#define g %r14
|
||||
#define h %r15
|
||||
|
||||
#define K %rbx
|
||||
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
|
||||
#define L_aligned_bswap L_bswap(%rip)
|
||||
#define stack_size (8+32*8+128) // 8 (_num_blocks) + ymm save/restore + WK(0:15)
|
||||
#define ymm_save 128(sp) // starting address for ymm save/restore
|
||||
|
||||
// 3 local variables
|
||||
#define s %rax
|
||||
#define t %rcx
|
||||
#define u %rsi
|
||||
|
||||
// a window (16 quad-words) of message schedule
|
||||
#define W0 %ymm0
|
||||
#define W1 %ymm1
|
||||
#define W2 %ymm2
|
||||
#define W3 %ymm3
|
||||
|
||||
// circular buffer for WK[(r:r+15)%16]
|
||||
#define WK(x) ((x)&15)*8(sp)
|
||||
|
||||
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
|
||||
|
||||
.macro Ch arg0, arg1, arg2
|
||||
#if 1
|
||||
mov \arg2, t
|
||||
xor \arg1, t
|
||||
and \arg0, t
|
||||
xor \arg2, t
|
||||
#else
|
||||
mov \arg0, t // x
|
||||
mov \arg0, s // x
|
||||
not t // ~x
|
||||
and \arg1, s // x & y
|
||||
and \arg2, t // ~x & z
|
||||
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
|
||||
.macro Maj arg0, arg1, arg2
|
||||
mov \arg1, t // y
|
||||
mov \arg2, s // z
|
||||
xor \arg2, t // y^z
|
||||
and \arg1, s // y&z
|
||||
and \arg0, t // x&(y^z)
|
||||
xor s, t // Maj(x,y,z)
|
||||
.endm
|
||||
|
||||
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
|
||||
|
||||
// performs Gamma0_512 on 4 quad-words in a ymm register
|
||||
// use ymm6/ymm7 as intermediate registers
|
||||
.macro Gamma0 arg0
|
||||
vpsrlq $1, \arg0, %ymm6 // part of S64(1, x)
|
||||
vpsllq $56, \arg0, %ymm7 // part of S64(8, x)
|
||||
vpsrlq $7, \arg0, \arg0 // R(7, x)
|
||||
vpxor %ymm6, \arg0, \arg0
|
||||
vpsrlq $7, %ymm6, %ymm6 // part of S64(8, x)
|
||||
vpxor %ymm7, \arg0, \arg0
|
||||
vpsllq $7, %ymm7, %ymm7 // part of S64(1, x)
|
||||
vpxor %ymm6, \arg0, \arg0
|
||||
vpxor %ymm7, \arg0, \arg0
|
||||
.endm
|
||||
|
||||
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
|
||||
|
||||
// performs Gamma1_512 on 4 quad-words in a ymm register
|
||||
// use ymm6/ymm7 as intermediate registers
|
||||
.macro Gamma1 arg0
|
||||
vpsrlq $19, \arg0, %ymm6 // part of S64(19, x)
|
||||
vpsllq $3, \arg0, %ymm7 // part of S64(61, x)
|
||||
vpsrlq $6, \arg0, \arg0 // R(6, x)
|
||||
vpxor %ymm6, \arg0, \arg0
|
||||
vpsrlq $42, %ymm6, %ymm6 // part of S64(61, x)
|
||||
vpxor %ymm7, \arg0, \arg0
|
||||
vpsllq $42, %ymm7, %ymm7 // part of S64(19, x)
|
||||
vpxor %ymm6, \arg0, \arg0
|
||||
vpxor %ymm7, \arg0, \arg0
|
||||
.endm
|
||||
|
||||
.macro rightshift16 arg0, arg1
|
||||
vpxor \arg1, \arg1, \arg1
|
||||
vperm2f128 $33, \arg1, \arg0, \arg1
|
||||
.endm
|
||||
|
||||
.macro leftshift16 arg0, arg1
|
||||
vpxor \arg1, \arg1, \arg1
|
||||
vperm2f128 $2, \arg1, \arg0, \arg1
|
||||
.endm
|
||||
|
||||
.macro vpalignr8 arg0, arg1, arg2
|
||||
vpblendd $3, \arg1, \arg0, \arg2
|
||||
vpermq $57, \arg2, \arg2
|
||||
.endm
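// AVX2 vpalignr only shifts within each 128-bit lane, so vpalignr8 emulates
// a full-width 8-byte extract: vpblendd pulls in the low qword of the second
// source, then vpermq $57 rotates the four qwords right by one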
|
||||
|
||||
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
/*
|
||||
W0 W1 W2 W3
|
||||
update 4 quad words in W0 += vext(W2,W3,#8) + Gamma0(vext(W0,W1, #8)) + Gamma1(W1<<16);
|
||||
W0 += Gamma1(vext(W3,W0, #16)).
|
||||
*/
|
||||
.macro message_update4 arg0, arg1, arg2, arg3
|
||||
vpblendd $3, \arg1, \arg0, %ymm5
|
||||
vpxor %ymm4, %ymm4, %ymm4
|
||||
vpermq $57, %ymm5, %ymm5 // ymm5 = W[r-15] = vpalignr8 \arg0, \arg1, %ymm5
|
||||
vperm2f128 $33, %ymm4, \arg3, %ymm4 // ymm4 = [W[16] W[17] 0 0] half of W[r-2] = rightshift16 \arg3, %ymm4
|
||||
Gamma0 %ymm5 // Gamma0(W[r-15])
|
||||
Gamma1 %ymm4 // Gamma1(W[r-2]) half
|
||||
vpaddq %ymm5, \arg0, \arg0 // W0 += Gamma0([r-15]);
|
||||
vpblendd $3, \arg3, \arg2, %ymm5
|
||||
vpaddq %ymm4, \arg0, \arg0 // W0 += Gamma1(W[r-2]) + Gamma0(vext(W0,W1, #8));
|
||||
vpermq $57, %ymm5, %ymm5 // W[r-7] = vpalignr8 \arg2, \arg3, %ymm5 // W[r-7]
|
||||
vpxor %ymm4, %ymm4, %ymm4
|
||||
vpaddq %ymm5, \arg0, \arg0 // W0 += W[r-7]
|
||||
vperm2f128 $2, %ymm4, \arg0, %ymm4 // leftshift16 \arg0, %ymm4 for W0<<16
|
||||
Gamma1 %ymm4 // Gamma1(W0<<16)
|
||||
vpaddq %ymm4, \arg0, \arg0 // W0 += Gamma1(W0<<16);
|
||||
.endm
|
||||
|
||||
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
|
||||
|
||||
.macro Sigma0 arg0
|
||||
rorx $28, \arg0, s // S(28, (x))
|
||||
rorx $34, \arg0, t // S(34, (x))
|
||||
rorx $11, s, u // S(39, (x))
|
||||
xor s, t // S(28, (x)) ^ S(34, (x))
|
||||
xor u, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
|
||||
.endm
|
||||
|
||||
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
|
||||
|
||||
.macro Sigma1 arg0
|
||||
rorx $14, \arg0, s // S(14, (x))
|
||||
rorx $18, \arg0, t // S(18, (x))
|
||||
rorx $27, s, u // S(41, (x))
|
||||
xor s, t // S(14, (x)) ^ S(18, (x))
|
||||
xor u, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
|
||||
.endm
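// rorx (BMI2) is a non-destructive three-operand rotate that leaves the
// flags untouched, so the AVX2 Sigma macros drop the mov/ror pairs used
// in the ssse3/avx1 variants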
|
||||
|
||||
// per round digests update
|
||||
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
Sigma1 \arg4 // t = T1
|
||||
add t, \arg7 // use h to store h+Sigma1(e)
|
||||
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
|
||||
add \arg7, t // t = h+Sigma1(e)+Ch(e,f,g);
|
||||
add WK(\arg8), t // h = T1
|
||||
add t, \arg3 // d += T1;
|
||||
mov t, \arg7 // h = T1
|
||||
Sigma0 \arg0 // t = Sigma0(a);
|
||||
add t, \arg7 // h = T1 + Sigma0(a);
|
||||
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
|
||||
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
|
||||
.endm
|
||||
|
||||
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
|
||||
rorx $14, \arg4, s // S(14, (x))
|
||||
mov \arg6, t // Ch(e,f,g) : 1
|
||||
rorx $18, \arg4, u // S(18, (x))
|
||||
xor \arg5, t // Ch(e,f,g) : 2
|
||||
xor s, u // S(14, (x)) ^ S(18, (x))
|
||||
and \arg4, t // Ch(e,f,g) : 3
|
||||
rorx $27, s, s // S(41, (x))
|
||||
xor \arg6, t // t = Ch(e,f,g);
|
||||
xor s, u // u = Sigma1(e);
|
||||
add t, \arg7 // h = h+Ch(e,f,g);
|
||||
add u, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
|
||||
|
||||
add WK(\arg8), \arg7 // h = T1
|
||||
add \arg7, \arg3 // d += T1;
|
||||
|
||||
rorx $28, \arg0, s // S(28, (x))
|
||||
rorx $34, \arg0, u // S(34, (x))
|
||||
xor s, u // S(28, (x)) ^ S(34, (x))
|
||||
rorx $11, s, s // S(39, (x))
|
||||
xor s, u // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
|
||||
add u, \arg7 // h = T1 + Sigma0(a);
|
||||
|
||||
mov \arg1, t // b
|
||||
mov \arg2, s // c
|
||||
xor \arg2, t // b^c
|
||||
and \arg1, s // b&c
|
||||
and \arg0, t // a&(b^c)
|
||||
xor s, t // t = Maj(a,b,c)
|
||||
|
||||
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
|
||||
|
||||
.endm
|
||||
|
||||
/*
|
||||
16 rounds of hash update, update input schedule W (in vector register ymm0-ymm3) and WK = W + K (in stack)
|
||||
*/
|
||||
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
|
||||
message_update4 W0, W1, W2, W3
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
|
||||
vpaddq 0*32(K), W0, %ymm4
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
|
||||
vmovdqa %ymm4, WK(0)
|
||||
|
||||
message_update4 W1, W2, W3, W0
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
|
||||
vpaddq 1*32(K), W1, %ymm4
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
|
||||
vmovdqa %ymm4, WK(4)
|
||||
|
||||
message_update4 W2, W3, W0, W1
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
|
||||
vpaddq 2*32(K), W2, %ymm4
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
|
||||
vmovdqa %ymm4, WK(8)
|
||||
|
||||
message_update4 W3, W0, W1, W2
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
|
||||
vpaddq 3*32(K), W3, %ymm4
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
|
||||
vmovdqa %ymm4, WK(12)
|
||||
|
||||
addq $128, K
|
||||
.endm
|
||||
|
||||
/*
|
||||
16 rounds of hash update, load new input schedule W (in vector registers ymm0-ymm3) and update WK = W + K (in stack)
|
||||
*/
|
||||
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
vmovdqu 0*32(data), W0
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
|
||||
vpshufb L_aligned_bswap, W0, W0
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
|
||||
vpaddq 0*32(K), W0, %ymm4
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
|
||||
vmovdqa %ymm4, WK(0)
|
||||
|
||||
|
||||
vmovdqu 1*32(data), W1
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
|
||||
vpshufb L_aligned_bswap, W1, W1
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
|
||||
vpaddq 1*32(K), W1, %ymm4
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
|
||||
vmovdqa %ymm4, WK(4)
|
||||
|
||||
vmovdqu 2*32(data), W2
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
|
||||
vpshufb L_aligned_bswap, W2, W2
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
|
||||
vpaddq 2*32(K), W2, %ymm4
|
||||
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
|
||||
vmovdqa %ymm4, WK(8)
|
||||
|
||||
vmovdqu 3*32(data), W3
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
|
||||
vpshufb L_aligned_bswap, W3, W3
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
|
||||
vpaddq 3*32(K), W3, %ymm4
|
||||
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
|
||||
vmovdqa %ymm4, WK(12)
|
||||
|
||||
addq $128, K
|
||||
addq $128, data
|
||||
.endm
|
||||
|
||||
/*
|
||||
16 rounds of hash update
|
||||
*/
|
||||
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
|
||||
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
|
||||
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
|
||||
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
|
||||
|
||||
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
|
||||
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
|
||||
|
||||
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
|
||||
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
|
||||
|
||||
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
|
||||
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
|
||||
|
||||
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
|
||||
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
|
||||
.endm
|
||||
|
||||
.text
|
||||
.globl _AccelerateCrypto_SHA512_compress_AVX2
|
||||
_AccelerateCrypto_SHA512_compress_AVX2:
|
||||
|
||||
// push callee-saved registers
|
||||
push %rbp
|
||||
movq %rsp, %rbp
|
||||
push %rbx
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
// allocate stack space
|
||||
sub $stack_size, sp
andq $-32, sp // align sp to 32 bytes
|
||||
|
||||
// if kernel code, save used ymm registers
|
||||
#if BUILDKERNEL
|
||||
vmovdqa %ymm0, 0*32+ymm_save
|
||||
vmovdqa %ymm1, 1*32+ymm_save
|
||||
vmovdqa %ymm2, 2*32+ymm_save
|
||||
vmovdqa %ymm3, 3*32+ymm_save
|
||||
vmovdqa %ymm4, 4*32+ymm_save
|
||||
vmovdqa %ymm5, 5*32+ymm_save
|
||||
vmovdqa %ymm6, 6*32+ymm_save
|
||||
vmovdqa %ymm7, 7*32+ymm_save
|
||||
#endif
|
||||
|
||||
movq num_blocks, _num_blocks
|
||||
|
||||
// set up pointer to the constants table K512[]
|
||||
lea CC_C_LABEL(sha512_K)(%rip), K
|
||||
|
||||
// load W[0:15] into ymm0-ymm3
|
||||
vmovdqu 0*32(data), W0
|
||||
vmovdqu 1*32(data), W1
|
||||
vmovdqu 2*32(data), W2
|
||||
vmovdqu 3*32(data), W3
|
||||
addq $128, data
|
||||
|
||||
vmovdqa L_aligned_bswap, %ymm4
|
||||
vpshufb %ymm4, W0, W0
|
||||
vpshufb %ymm4, W1, W1
|
||||
vpshufb %ymm4, W2, W2
|
||||
vpshufb %ymm4, W3, W3
|
||||
|
||||
// compute WK[0:15] and save in stack
|
||||
vpaddq 0*32(K), W0, %ymm4
|
||||
vpaddq 1*32(K), W1, %ymm5
|
||||
vpaddq 2*32(K), W2, %ymm6
|
||||
vpaddq 3*32(K), W3, %ymm7
|
||||
addq $128, K
|
||||
vmovdqa %ymm4, WK(0)
|
||||
vmovdqa %ymm5, WK(4)
|
||||
vmovdqa %ymm6, WK(8)
|
||||
vmovdqa %ymm7, WK(12)
|
||||
|
||||
L_loop:
|
||||
|
||||
// digests a-h = ctx->states;
|
||||
mov 0*8(ctx), a
|
||||
mov 1*8(ctx), b
|
||||
mov 2*8(ctx), c
|
||||
mov 3*8(ctx), d
|
||||
mov 4*8(ctx), e
|
||||
mov 5*8(ctx), f
|
||||
mov 6*8(ctx), g
|
||||
mov 7*8(ctx), h
|
||||
|
||||
// rounds 0:63 interleaved with W/WK update for rounds 16:79
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 16
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 32
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 48
|
||||
rounds_schedule a, b, c, d, e, f, g, h, 64
|
||||
|
||||
// revert K to the beginning of K512[]
|
||||
subq $640, K
|
||||
subq $1, _num_blocks // num_blocks--
|
||||
|
||||
je L_final_block // if final block, wrap up final rounds
|
||||
|
||||
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
|
||||
|
||||
// ctx->states += digests a-h
|
||||
add a, 0*8(ctx)
|
||||
add b, 1*8(ctx)
|
||||
add c, 2*8(ctx)
|
||||
add d, 3*8(ctx)
|
||||
add e, 4*8(ctx)
|
||||
add f, 5*8(ctx)
|
||||
add g, 6*8(ctx)
|
||||
add h, 7*8(ctx)
|
||||
|
||||
jmp L_loop // branch for next block
|
||||
|
||||
// wrap up digest update rounds 64:79 for the final block
|
||||
L_final_block:
|
||||
rounds_schedule_final a, b, c, d, e, f, g, h, 0
|
||||
|
||||
// ctx->states += digests a-h
|
||||
add a, 0*8(ctx)
|
||||
add b, 1*8(ctx)
|
||||
add c, 2*8(ctx)
|
||||
add d, 3*8(ctx)
|
||||
add e, 4*8(ctx)
|
||||
add f, 5*8(ctx)
|
||||
add g, 6*8(ctx)
|
||||
add h, 7*8(ctx)
|
||||
|
||||
// if kernel, restore ymm0-ymm7
|
||||
#if BUILDKERNEL
|
||||
vmovdqa 0*32+ymm_save, %ymm0
|
||||
vmovdqa 1*32+ymm_save, %ymm1
|
||||
vmovdqa 2*32+ymm_save, %ymm2
|
||||
vmovdqa 3*32+ymm_save, %ymm3
|
||||
vmovdqa 4*32+ymm_save, %ymm4
|
||||
vmovdqa 5*32+ymm_save, %ymm5
|
||||
vmovdqa 6*32+ymm_save, %ymm6
|
||||
vmovdqa 7*32+ymm_save, %ymm7
|
||||
#endif
|
||||
|
||||
// free allocated stack memory
|
||||
leaq -40(%rbp), sp
|
||||
|
||||
// restore callee-saved registers
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbx
|
||||
pop %rbp
|
||||
|
||||
// return
|
||||
ret
|
||||
|
||||
// data for using ssse3 pshufb instruction (big-endian loading of data)
|
||||
CC_ASM_SECTION_CONST
|
||||
.p2align 5
|
||||
|
||||
L_bswap:
|
||||
.quad 0x0001020304050607
|
||||
.quad 0x08090a0b0c0d0e0f
|
||||
.quad 0x1011121314151617
|
||||
.quad 0x18191a1b1c1d1e1f
|
||||
|
||||
#endif // x86_64
|
||||
|
|
@ -0,0 +1,619 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
|
||||
#
|
||||
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
# is contained in the License.txt file distributed with corecrypto) and only to
|
||||
# people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
# Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
# devices and computers you own or control, for the sole purpose of verifying the
|
||||
# security characteristics and correct functioning of the Apple Software. You may
|
||||
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
/*
|
||||
This file provides x86_64 hand implementation of the following function
|
||||
|
||||
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
|
||||
|
||||
sha512 algorithm per block description:
|
||||
|
||||
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
|
||||
2. load 8 digests (each 64bit) a-h from state
|
||||
3. for r = 0:15
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
4. for r = 16:79
|
||||
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
|
||||
d += T1;
|
||||
h = T1 + Sigma0(a) + Maj(a,b,c)
|
||||
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
|
||||
|
||||
In the assembly implementation:
|
||||
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx2/avx512)
|
||||
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
|
||||
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
|
||||
into the last 16 rounds of its previous block:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
|
||||
pre_calculate and store W+K(0:15) in stack
|
||||
|
||||
L_loop:
|
||||
|
||||
load digests a-h from ctx->state;
|
||||
|
||||
for (r=0;r<64;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
|
||||
}
|
||||
|
||||
num_block--;
|
||||
if (num_block==0) jmp L_last_block;
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
|
||||
pre_calculate and store W+K([r:r+1]%16) in stack
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
jmp L_loop;
|
||||
|
||||
L_last_block:
|
||||
|
||||
for (r=64;r<80;r+=2) {
|
||||
digests a-h update and permute round r:r+1
|
||||
}
|
||||
|
||||
ctx->states += digests a-h;
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
Apple CoreOS vector & numerics
|
||||
*/
|
||||
#if defined __x86_64__
|
||||
|
||||
// associate variables with registers or memory
|
||||
|
||||
#define sp %rsp
|
||||
#define ctx %rdi
|
||||
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
|
||||
#define data %rdx
|
||||
|
||||
#define a %r8
|
||||
#define b %r9
|
||||
#define c %r10
|
||||
#define d %r11
|
||||
#define e %r12
|
||||
#define f %r13
|
||||
#define g %r14
|
||||
#define h %r15
|
||||
|
||||
#define K %rbx
|
||||
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define stack_size (8+16*12+128) // 8 (_num_blocks) + xmm0:xmm11 + WK(0:15)

#define L_aligned_bswap L_bswap(%rip) // bswap : big-endian loading of 8-byte words
#define xmm_save 128(sp) // starting address for xmm save/restore
|
||||
|
||||
// 3 local variables
|
||||
#define s %rax
|
||||
#define t %rcx
|
||||
#define u %rsi
|
||||
|
||||
// a window (16 quad-words) of message schedule
|
||||
#define W0 %xmm0
|
||||
#define W1 %xmm1
|
||||
#define W2 %xmm2
|
||||
#define W3 %xmm3
|
||||
#define W4 %xmm4
|
||||
#define W5 %xmm5
|
||||
#define W6 %xmm6
|
||||
#define W7 %xmm7
|
||||
|
||||
// circular buffer for WK[(r:r+15)%16]
|
||||
#define WK(x) ((x)&15)*8(sp)
|
||||
|
||||
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
|
||||
|
||||
.macro Ch arg0, arg1, arg2
|
||||
#if 1
|
||||
mov \arg2, t
|
||||
xor \arg1, t
|
||||
and \arg0, t
|
||||
xor \arg2, t
|
||||
#else
|
||||
mov \arg0, t // x
|
||||
mov \arg0, s // x
|
||||
not t // ~x
|
||||
and \arg1, s // x & y
|
||||
and \arg2, t // ~x & z
|
||||
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||||
|
||||
.macro Maj arg0, arg1, arg2
|
||||
mov \arg1, t // y
|
||||
mov \arg2, s // z
|
||||
xor \arg2, t // y^z
|
||||
and \arg1, s // y&z
|
||||
and \arg0, t // x&(y^z)
|
||||
xor s, t // Maj(x,y,z)
|
||||
.endm
|
||||
|
||||
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
|
||||
|
||||
// performs Gamma0_512 on 2 words on an xmm registers
|
||||
// use xmm8/xmm9 as intermediate registers
|
||||
.macro Gamma0 arg0
|
||||
movdqa \arg0, %xmm8
|
||||
movdqa \arg0, %xmm9
|
||||
psrlq $7, \arg0 // R(7, x)
|
||||
psrlq $1, %xmm8 // part of S64(1, x)
|
||||
psllq $56, %xmm9 // part of S64(8, x)
|
||||
pxor %xmm8, \arg0
|
||||
psrlq $7, %xmm8 // part of S64(8, x)
|
||||
pxor %xmm9, \arg0
|
||||
psllq $7, %xmm9 // part of S64(1, x)
|
||||
pxor %xmm8, \arg0
|
||||
pxor %xmm9, \arg0
|
||||
.endm
|
||||
|
||||
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
|
||||
|
||||
// performs Gamma1_512 on 2 words on an xmm registers
|
||||
// use xmm8/xmm9 as intermediate registers
|
||||
.macro Gamma1 arg0
|
||||
movdqa \arg0, %xmm8
|
||||
movdqa \arg0, %xmm9
|
||||
psrlq $6, \arg0 // R(6, x)
|
||||
psrlq $19, %xmm8 // part of S64(19, x)
|
||||
psllq $3, %xmm9 // part of S64(61, x)
|
||||
pxor %xmm8, \arg0
|
||||
psrlq $42, %xmm8 // part of S64(61, x)
|
||||
pxor %xmm9, \arg0
|
||||
psllq $42, %xmm9 // part of S64(19, x)
|
||||
pxor %xmm8, \arg0
|
||||
pxor %xmm9, \arg0
|
||||
.endm
|
||||
|
||||
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
|
||||
/*
|
||||
W0 W1 W2 W3 W4 W5 W6 W7
|
||||
|
||||
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
|
||||
use %xmm10, %xmm11 for temp
|
||||
*/
|
||||
.macro message_update2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
movdqa \arg5, %xmm10
|
||||
movdqa \arg1, %xmm11
|
||||
palignr $8, \arg4, %xmm10 // vext(W4,W5)
|
||||
palignr $8, \arg0, %xmm11 // vext(W0,W1)
|
||||
paddq %xmm10, \arg0 // W0 + vext(W4,W5)
|
||||
movdqa \arg7, %xmm10
|
||||
Gamma1 %xmm10 // Gamma1(W7)
|
||||
Gamma0 %xmm11 // Gamma0(vext(W0,W1))
|
||||
paddq %xmm10, \arg0 // W0 + Gamma1(W7) + vext(W4,W5)
|
||||
paddq %xmm11, \arg0 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
|
||||
.endm
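/*
	The same two-lane schedule update in scalar C (illustrative only;
	gamma0/gamma1 follow the Gamma0/Gamma1 definitions above, and W[] is
	the 16-entry circular window, so this assumes r >= 16):

	#include <stdint.h>

	#define ROR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

	static uint64_t gamma0(uint64_t x) { return ROR64(x, 1) ^ ROR64(x, 8) ^ (x >> 7); }
	static uint64_t gamma1(uint64_t x) { return ROR64(x, 19) ^ ROR64(x, 61) ^ (x >> 6); }

	// update lanes r%16 and (r+1)%16 (the two 64-bit lanes of one xmm register)
	static void message_update2_ref(uint64_t W[16], unsigned r)
	{
	    for (unsigned i = r; i < r + 2; i++) {
	        W[i % 16] += gamma1(W[(i - 2) % 16]) + W[(i - 7) % 16]
	                   + gamma0(W[(i - 15) % 16]);
	    }
	}
*/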
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))

.macro Sigma0 arg0
    mov \arg0, t // x
    mov \arg0, s // x
    ror $28, t // S(28, (x))
    ror $34, s // S(34, (x))
    xor s, t // S(28, (x)) ^ S(34, (x))
    ror $5, s // S(39, (x))
    xor s, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm

// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))

.macro Sigma1 arg0
    mov \arg0, s // x
    ror $14, s // S(14, (x))
    mov s, t // S(14, (x))
    ror $4, s // S(18, (x))
    xor s, t // S(14, (x)) ^ S(18, (x))
    ror $23, s // S(41, (x))
    xor s, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm

// per-round digest update (reference version)
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    Sigma1 \arg4 // t = Sigma1(e);
    add t, \arg7 // h = h+Sigma1(e)
    Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
    add t, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
    add WK(\arg8), \arg7 // h = h+Sigma1(e)+Ch(e,f,g)+WK = T1
    add \arg7, \arg3 // d += T1;
    Sigma0 \arg0 // t = Sigma0(a);
    add t, \arg7 // h += Sigma0(a);
    Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
    add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm

// scheduled version of round_ref: Sigma1/Sigma0 use chained rotations,
// Ch uses (((f^g)&e)^g), and Maj uses ((a|c)&b)|(a&c)
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    mov \arg4, s
    mov \arg0, t
    ror $(41-18), s
    ror $(39-34), t
    xor \arg4, s
    mov \arg5, u
    xor \arg0, t
    ror $(18-14), s
    xor \arg6, u
    xor \arg4, s
    ror $(34-28), t
    and \arg4, u
    xor \arg0, t
    xor \arg6, u
    ror $14, s
    ror $28, t
    add s, u
    mov \arg0, s
    add WK(\arg8), u
    or \arg2, s
    add u, \arg7
    mov \arg0, u
    add \arg7, \arg3
    and \arg1, s
    and \arg2, u
    or u, s
    add t, \arg7
    add s, \arg7
.endm
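/*
	Why the rotate amounts in the scheduled round look odd: rotation
	distributes over xor, so Sigma1(e) = S(14,e)^S(18,e)^S(41,e) can be
	chained as ror(ror(ror(e,41-18)^e,18-14)^e,14), which needs one fewer
	temporary. A quick C check of the identity (illustrative, not part of
	this file):

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t ror(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

	int main(void)
	{
	    uint64_t e = 0x0123456789abcdefULL;
	    uint64_t direct  = ror(e, 14) ^ ror(e, 18) ^ ror(e, 41);
	    uint64_t chained = ror(ror(ror(e, 41 - 18) ^ e, 18 - 14) ^ e, 14);
	    printf("%s\n", direct == chained ? "match" : "mismatch"); // prints "match"
	    return 0;
	}
*/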
/*
    16 rounds of hash update; updates the input schedule W (in vector registers xmm0-xmm7) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

    message_update2 W0, W1, W2, W3, W4, W5, W6, W7
    movdqa 0*16(K), %xmm8
    round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
    paddq W0, %xmm8
    round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
    movdqa %xmm8, WK(0)

    message_update2 W1, W2, W3, W4, W5, W6, W7, W0
    movdqa 1*16(K), %xmm8
    round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
    paddq W1, %xmm8
    round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
    movdqa %xmm8, WK(2)

    message_update2 W2, W3, W4, W5, W6, W7, W0, W1
    movdqa 2*16(K), %xmm8
    round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
    paddq W2, %xmm8
    round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
    movdqa %xmm8, WK(4)

    message_update2 W3, W4, W5, W6, W7, W0, W1, W2
    movdqa 3*16(K), %xmm8
    round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
    paddq W3, %xmm8
    round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
    movdqa %xmm8, WK(6)

    message_update2 W4, W5, W6, W7, W0, W1, W2, W3
    movdqa 4*16(K), %xmm8
    round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
    paddq W4, %xmm8
    round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
    movdqa %xmm8, WK(8)

    message_update2 W5, W6, W7, W0, W1, W2, W3, W4
    movdqa 5*16(K), %xmm8
    round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
    paddq W5, %xmm8
    round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
    movdqa %xmm8, WK(10)

    message_update2 W6, W7, W0, W1, W2, W3, W4, W5
    movdqa 6*16(K), %xmm8
    round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
    paddq W6, %xmm8
    round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
    movdqa %xmm8, WK(12)

    message_update2 W7, W0, W1, W2, W3, W4, W5, W6
    movdqa 7*16(K), %xmm8
    round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
    paddq W7, %xmm8
    round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
    movdqa %xmm8, WK(14)

    addq $128, K
.endm
/*
    16 rounds of hash update; loads the new input schedule W (into vector registers xmm0-xmm7) and updates WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8

    movdqu 0*16(data), W0
    movdqa 0*16(K), %xmm8
    pshufb L_aligned_bswap, W0
    round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
    paddq W0, %xmm8
    round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
    movdqa %xmm8, WK(0)

    movdqu 1*16(data), W1
    movdqa 1*16(K), %xmm8
    pshufb L_aligned_bswap, W1
    round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
    paddq W1, %xmm8
    round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
    movdqa %xmm8, WK(2)

    movdqu 2*16(data), W2
    movdqa 2*16(K), %xmm8
    pshufb L_aligned_bswap, W2
    round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
    paddq W2, %xmm8
    round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
    movdqa %xmm8, WK(4)

    movdqu 3*16(data), W3
    movdqa 3*16(K), %xmm8
    pshufb L_aligned_bswap, W3
    round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
    paddq W3, %xmm8
    round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
    movdqa %xmm8, WK(6)

    movdqu 4*16(data), W4
    movdqa 4*16(K), %xmm8
    pshufb L_aligned_bswap, W4
    round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
    paddq W4, %xmm8
    round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
    movdqa %xmm8, WK(8)

    movdqu 5*16(data), W5
    movdqa 5*16(K), %xmm8
    pshufb L_aligned_bswap, W5
    round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
    paddq W5, %xmm8
    round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
    movdqa %xmm8, WK(10)

    movdqu 6*16(data), W6
    movdqa 6*16(K), %xmm8
    pshufb L_aligned_bswap, W6
    round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
    paddq W6, %xmm8
    round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
    movdqa %xmm8, WK(12)

    movdqu 7*16(data), W7
    movdqa 7*16(K), %xmm8
    pshufb L_aligned_bswap, W7
    round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
    paddq W7, %xmm8
    round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
    movdqa %xmm8, WK(14)

    addq $128, K
    addq $128, data
.endm
/*
    16 rounds of hash update (final block: no further message schedule is needed)
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
    round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8

    round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
    round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8

    round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
    round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8

    round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
    round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8

    round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
    round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8

    round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
    round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8

    round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
    round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8

    round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
    round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
    .text
    .globl _AccelerateCrypto_SHA512_compress_ssse3
_AccelerateCrypto_SHA512_compress_ssse3:

    // push callee-saved registers
    push %rbp
    movq %rsp, %rbp
    push %rbx
    push %r12
    push %r13
    push %r14
    push %r15

    // allocate stack space
    sub $stack_size, sp

    // if kernel code, save used xmm registers
#if BUILDKERNEL
    movdqa %xmm0, 0*16+xmm_save
    movdqa %xmm1, 1*16+xmm_save
    movdqa %xmm2, 2*16+xmm_save
    movdqa %xmm3, 3*16+xmm_save
    movdqa %xmm4, 4*16+xmm_save
    movdqa %xmm5, 5*16+xmm_save
    movdqa %xmm6, 6*16+xmm_save
    movdqa %xmm7, 7*16+xmm_save
    movdqa %xmm8, 8*16+xmm_save
    movdqa %xmm9, 9*16+xmm_save
    movdqa %xmm10, 10*16+xmm_save
    movdqa %xmm11, 11*16+xmm_save
#endif

    movq num_blocks, _num_blocks

    // set up the pointer to the round-constant table sha512_K[]
    lea CC_C_LABEL(sha512_K)(%rip), K

    // load W[0:15] into xmm0-xmm7
    movdqu 0*16(data), W0
    movdqu 1*16(data), W1
    movdqu 2*16(data), W2
    movdqu 3*16(data), W3
    movdqu 4*16(data), W4
    movdqu 5*16(data), W5
    movdqu 6*16(data), W6
    movdqu 7*16(data), W7
    addq $128, data

    // byte-swap each 8-byte word to big-endian
    movdqa L_aligned_bswap, %xmm8
    pshufb %xmm8, W0
    pshufb %xmm8, W1
    pshufb %xmm8, W2
    pshufb %xmm8, W3
    pshufb %xmm8, W4
    pshufb %xmm8, W5
    pshufb %xmm8, W6
    pshufb %xmm8, W7

    // compute WK[0:15] and save in stack
    movdqa 0*16(K), %xmm8
    movdqa 1*16(K), %xmm9
    movdqa 2*16(K), %xmm10
    movdqa 3*16(K), %xmm11
    paddq %xmm0, %xmm8
    paddq %xmm1, %xmm9
    paddq %xmm2, %xmm10
    paddq %xmm3, %xmm11
    movdqa %xmm8, WK(0)
    movdqa %xmm9, WK(2)
    movdqa %xmm10, WK(4)
    movdqa %xmm11, WK(6)

    movdqa 4*16(K), %xmm8
    movdqa 5*16(K), %xmm9
    movdqa 6*16(K), %xmm10
    movdqa 7*16(K), %xmm11
    paddq %xmm4, %xmm8
    paddq %xmm5, %xmm9
    paddq %xmm6, %xmm10
    paddq %xmm7, %xmm11
    movdqa %xmm8, WK(8)
    movdqa %xmm9, WK(10)
    movdqa %xmm10, WK(12)
    movdqa %xmm11, WK(14)
    addq $128, K

L_loop:

    // digests a-h = ctx->states;
    mov 0*8(ctx), a
    mov 1*8(ctx), b
    mov 2*8(ctx), c
    mov 3*8(ctx), d
    mov 4*8(ctx), e
    mov 5*8(ctx), f
    mov 6*8(ctx), g
    mov 7*8(ctx), h

    // rounds 0:63 interleaved with the W/WK update for rounds 16:79
    rounds_schedule a, b, c, d, e, f, g, h, 16
    rounds_schedule a, b, c, d, e, f, g, h, 32
    rounds_schedule a, b, c, d, e, f, g, h, 48
    rounds_schedule a, b, c, d, e, f, g, h, 64

    // revert K to the beginning of sha512_K[]
    subq $640, K
    subq $1, _num_blocks // num_blocks--

    je L_final_block // if final block, wrap up final rounds

    rounds_schedule_initial a, b, c, d, e, f, g, h, 0

    // ctx->states += digests a-h
    add a, 0*8(ctx)
    add b, 1*8(ctx)
    add c, 2*8(ctx)
    add d, 3*8(ctx)
    add e, 4*8(ctx)
    add f, 5*8(ctx)
    add g, 6*8(ctx)
    add h, 7*8(ctx)

    jmp L_loop // branch for next block

    // wrap up digest update rounds 64:79 for the final block
L_final_block:
    rounds_schedule_final a, b, c, d, e, f, g, h, 0

    // ctx->states += digests a-h
    add a, 0*8(ctx)
    add b, 1*8(ctx)
    add c, 2*8(ctx)
    add d, 3*8(ctx)
    add e, 4*8(ctx)
    add f, 5*8(ctx)
    add g, 6*8(ctx)
    add h, 7*8(ctx)

    // if kernel, restore xmm0-xmm11
#if BUILDKERNEL
    movdqa 0*16+xmm_save, %xmm0
    movdqa 1*16+xmm_save, %xmm1
    movdqa 2*16+xmm_save, %xmm2
    movdqa 3*16+xmm_save, %xmm3
    movdqa 4*16+xmm_save, %xmm4
    movdqa 5*16+xmm_save, %xmm5
    movdqa 6*16+xmm_save, %xmm6
    movdqa 7*16+xmm_save, %xmm7
    movdqa 8*16+xmm_save, %xmm8
    movdqa 9*16+xmm_save, %xmm9
    movdqa 10*16+xmm_save, %xmm10
    movdqa 11*16+xmm_save, %xmm11
#endif

    // free allocated stack memory
    add $stack_size, sp

    // restore callee-saved registers
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbx
    pop %rbp

    // return
    ret

// data for the ssse3 pshufb instruction (big-endian loading of data)
    CC_ASM_SECTION_CONST
    .p2align 4

L_bswap:
    .quad 0x0001020304050607
    .quad 0x08090a0b0c0d0e0f

#endif // x86_64
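/*
	How this entry point is reached from C: based on the register
	assignments above (ctx in %rdi, num_blocks in %rsi, data in %rdx) and
	the System V AMD64 calling convention, the prototype is roughly the
	following. The exact declaration lives in an AccelerateCrypto header
	that is not part of this file, so treat this sketch as an assumption:

	// state points at the eight 64-bit chaining values (ctx->state above);
	// data points at num_blocks contiguous 128-byte message blocks
	extern void AccelerateCrypto_SHA512_compress_ssse3(uint64_t state[8],
	                                                   uint64_t num_blocks,
	                                                   const void *data);
*/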
@ -0,0 +1,58 @@
/* Copyright (c) (2016,2018-2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <stdint.h>
#include <corecrypto/cc_config.h>

/* the K array: the first 64 bits of the fractional parts of the cube roots
   of the first eighty primes (FIPS 180-4, section 4.2.3) */
const uint64_t sha512_K[80] CC_ALIGNED(16) = {
    0x428a2f98d728ae22, 0x7137449123ef65cd,
    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
    0x3956c25bf348b538, 0x59f111f1b605d019,
    0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
    0xd807aa98a3030242, 0x12835b0145706fbe,
    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
    0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
    0x9bdc06a725c71235, 0xc19bf174cf692694,
    0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
    0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
    0x983e5152ee66dfab, 0xa831c66d2db43210,
    0xb00327c898fb213f, 0xbf597fc7beef0ee4,
    0xc6e00bf33da88fc2, 0xd5a79147930aa725,
    0x06ca6351e003826f, 0x142929670a0e6e70,
    0x27b70a8546d22ffc, 0x2e1b21385c26c926,
    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
    0x650a73548baf63de, 0x766a0abb3c77b2a8,
    0x81c2c92e47edaee6, 0x92722c851482353b,
    0xa2bfe8a14cf10364, 0xa81a664bbc423001,
    0xc24b8b70d0f89791, 0xc76c51a30654be30,
    0xd192e819d6ef5218, 0xd69906245565a910,
    0xf40e35855771202a, 0x106aa07032bbd1b8,
    0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
    0x748f82ee5defb2fc, 0x78a5636f43172f60,
    0x84c87814a1f0ab72, 0x8cc702081a6439ec,
    0x90befffa23631e28, 0xa4506cebde82bde9,
    0xbef9a3f7b2c67915, 0xc67178f2e372532b,
    0xca273eceea26619c, 0xd186b8c721c0c207,
    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
    0x06f067aa72176fba, 0x0a637dc5a2c898a6,
    0x113f9804bef90dae, 0x1b710b35131c471b,
    0x28db77f523047d84, 0x32caab7b40c72493,
    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
    0x5fcb6fab3ad6faec, 0x6c44198c4a475817
};
@ -0,0 +1,182 @@
/* Copyright (c) (2010-2012,2014-2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_CC_H_
#define _CORECRYPTO_CC_H_

#include <corecrypto/cc_config.h>
#include <corecrypto/cc_error.h>
#include <string.h>
#include <stdint.h>

#if __has_feature(attribute_availability_with_replacement)
#if __has_feature(attribute_availability_bridgeos)
#ifndef __CC_BRIDGEOS_DEPRECATED
#define __CC_BRIDGEOS_DEPRECATED(_dep, _msg) __attribute__((availability(bridgeos,deprecated=_dep, replacement=_msg)))
#endif
#endif

#ifndef __CC_BRIDGEOS_DEPRECATED
#define __CC_BRIDGEOS_DEPRECATED(_dep, _msg)
#endif

#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version) \
__attribute__((availability(macos,deprecated=macos_version, replacement=replacement_message)))\
__attribute__((availability(ios,deprecated=ios_version, replacement=replacement_message)))\
__attribute__((availability(watchos,deprecated=watchos_version, replacement=replacement_message)))\
__attribute__((availability(tvos,deprecated=tvos_version, replacement=replacement_message)))\
__CC_BRIDGEOS_DEPRECATED(bridgeos_version, replacement_message)

#else /* !__has_feature(attribute_availability_with_replacement) */

#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version)

#endif /* __has_feature(attribute_availability_with_replacement) */

/* Provide a general-purpose macro concatenation method. */
#define cc_concat_(a, b) a##b
#define cc_concat(a, b) cc_concat_(a, b)

#if defined(_MSC_VER)
#define __asm__(x)
#endif

/* Manage asserts here, because a few functions in public header files do use asserts. */
#if CORECRYPTO_DEBUG
#define cc_assert(x) assert(x)
#else
#define cc_assert(x)
#endif

#if CC_KERNEL
#include <kern/assert.h>
#elif CC_USE_S3
#define assert(args) // No assert in S3
#else
#include <assert.h>
#endif

/* Provide a static assert that can be used to create compile-time failures. */
#define cc_static_assert(e,m) \
    enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) }

/* Declare a struct element with a guaranteed alignment of _alignment_.
   The resulting struct can be used to create arrays that are aligned by
   a certain amount. */
#define cc_aligned_struct(_alignment_)  \
typedef struct { \
    uint8_t b[_alignment_]; \
} CC_ALIGNED(_alignment_)

#if defined(__BIGGEST_ALIGNMENT__)
#define CC_MAX_ALIGNMENT ((size_t)__BIGGEST_ALIGNMENT__)
#else
#define CC_MAX_ALIGNMENT ((size_t)16)
#endif

/* pads a given size to be a multiple of the biggest alignment for any type */
#define cc_pad_align(_size_) ((_size_ + CC_MAX_ALIGNMENT - 1) & (~(CC_MAX_ALIGNMENT - 1)))

/* number of array elements used in a cc_ctx_decl */
#define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_))

/* sizeof of a context declared with cc_ctx_decl */
#define cc_ctx_sizeof(_type_, _size_) sizeof(_type_[cc_ctx_n(_type_, _size_)])
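/* Worked example of the size helpers above (a sketch; the second assert
   assumes CC_MAX_ALIGNMENT == 16, i.e. a 16-byte __BIGGEST_ALIGNMENT__): */
cc_static_assert(cc_ctx_n(uint64_t, 36) == 5, "five 8-byte elements cover 36 bytes"); /* (36+8-1)/8 == 5 */
cc_static_assert(cc_pad_align(21) == 32, "21 bytes pad to 32");                       /* (21+15) & ~15 == 32 */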
/*
  1. _alloca cannot be removed, because this header file is compiled with both MSVC++ and clang.
  2. The _MSC_VER version of cc_ctx_decl() is not compatible with the way the *_decl macros are used in CommonCrypto, AppleKeyStore and SecurityFrameworks. To observe the incompatibilities and errors, use the definition below. corecrypto itself accepts both definitions:
     #define cc_ctx_decl(_type_, _size_, _name_) _type_ _name_ ## _array[cc_ctx_n(_type_, (_size_))]; _type_ *_name_ = _name_ ## _array
  3. Never use the sizeof() operator on variables declared with cc_ctx_decl(), because it is not compatible with the _MSC_VER version of cc_ctx_decl().
*/
#if defined(_MSC_VER)
#include <malloc.h>
#define cc_ctx_decl(_type_, _size_, _name_) _type_ * _name_ = (_type_ *) _alloca(sizeof(_type_) * cc_ctx_n(_type_, _size_) )
#define cc_ctx_decl_field(_type_, _size_, _name_) _type_ _name_ [cc_ctx_n(_type_, _size_)]
#else
#define cc_ctx_decl(_type_, _size_, _name_) \
    _Pragma("GCC diagnostic push") \
    _Pragma("GCC diagnostic ignored \"-Wvla\"") \
    _type_ _name_ [cc_ctx_n(_type_, _size_)] \
    _Pragma("GCC diagnostic pop")
#define cc_ctx_decl_field cc_ctx_decl
#endif
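/* A minimal usage sketch of cc_ctx_decl. The element type and byte size are
   illustrative; real callers size contexts from an algorithm descriptor, and
   cc_clear() them (declared below) before they go out of scope: */
static CC_UNUSED uint64_t cc_ctx_decl_example(void)
{
    cc_ctx_decl(uint64_t, 128, ws); /* 128 bytes -> cc_ctx_n(uint64_t, 128) == 16 elements */
    ws[0] = 0x1234;                 /* use like an array, but never sizeof(ws) (see note 3) */
    return ws[0];
}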
/*!
 @brief cc_clear(len, dst) zeroizes the array dst; the write will not be optimized out.
 @discussion It is used to clear sensitive data, particularly when it lives on the stack.
 @param len number of bytes to be cleared in dst
 @param dst input array
 */
CC_NONNULL((2))
void cc_clear(size_t len, void *dst);

// cc_zero is deprecated, please use cc_clear instead.
cc_deprecate_with_replacement("cc_clear", 13.0, 10.15, 13.0, 6.0, 4.0)
CC_NONNULL_ALL CC_INLINE
void cc_zero(size_t len, void *dst)
{
    cc_clear(len, dst);
}

#define cc_copy(_size_, _dst_, _src_) memcpy(_dst_, _src_, _size_)

CC_INLINE CC_NONNULL((2, 3, 4))
void cc_xor(size_t size, void *r, const void *s, const void *t) {
    uint8_t *_r = (uint8_t *)r;
    const uint8_t *_s = (const uint8_t *)s;
    const uint8_t *_t = (const uint8_t *)t;
    while (size--) {
        _r[size] = _s[size] ^ _t[size];
    }
}

/*!
 @brief cc_cmp_safe(num, ptr1, ptr2) compares the two arrays ptr1 and ptr2, each of num bytes.
 @discussion The execution time/cycle count is independent of the data and therefore leaks no information about it. The execution time does, however, depend on num.
 @param num number of bytes in each array
 @param ptr1 input array
 @param ptr2 input array
 @return 0 if the num bytes starting at ptr1 are identical to the num bytes starting at ptr2, and 1 if they are different or if num is 0 (empty arrays).
 */
CC_NONNULL((2, 3))
int cc_cmp_safe(size_t num, const void *ptr1, const void *ptr2);
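/* One standard way to meet the contract above is to accumulate byte
   differences with OR, so every byte is always read and there is no
   data-dependent branch. A sketch (not necessarily the actual
   implementation, which lives in a .c file): */
static CC_UNUSED int cc_cmp_safe_sketch(size_t num, const void *ptr1, const void *ptr2)
{
    const uint8_t *p1 = (const uint8_t *)ptr1;
    const uint8_t *p2 = (const uint8_t *)ptr2;
    uint8_t diff = 0;
    for (size_t i = 0; i < num; i++) {
        diff |= (uint8_t)(p1[i] ^ p2[i]); /* no early exit on mismatch */
    }
    return (num == 0) || (diff != 0); /* 0 only for identical, non-empty inputs */
}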
/* Exchange S and T of any type. NOTE: Both S and T are evaluated
   multiple times and MUST NOT be expressions. */
#define CC_SWAP(S,T) do { \
    volatile __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \
    _cc_swap_tmp = 0;\
} while(0)

/* Return the maximum value between S and T. */
#define CC_MAX(S, T) ({__typeof__(S) _cc_max_s = S; __typeof__(T) _cc_max_t = T; _cc_max_s > _cc_max_t ? _cc_max_s : _cc_max_t;})

/* Clone of CC_MAX() that evaluates S and T multiple times, to allow nesting. */
#define CC_MAX_EVAL(S, T) ((S) > (T) ? (S) : (T))

/* Return the minimum value between S and T. */
#define CC_MIN(S, T) ({__typeof__(S) _cc_min_s = S; __typeof__(T) _cc_min_t = T; _cc_min_s <= _cc_min_t ? _cc_min_s : _cc_min_t;})

/*
  When building with "-nostdinc" (i.e. iboot), ptrauth.h is in a non-standard location.
  This requires a new flag to be used when building iboot: -ibuiltininc, which is not
  yet available.
*/
#if __has_feature(ptrauth_calls) && (CC_KERNEL || CC_USE_L4 || CC_USE_SEPROM)
#include <ptrauth.h>
#define CC_SPTR(_sn_, _n_) \
    __ptrauth(ptrauth_key_process_independent_code, 1, ptrauth_string_discriminator("cc_" #_sn_ #_n_)) _n_
#else
#define CC_SPTR(_sn_, _n_) _n_
#endif

#endif /* _CORECRYPTO_CC_H_ */
@ -0,0 +1,83 @@
/* Copyright (c) (2016-2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef cc_absolute_time_h
#define cc_absolute_time_h

#include <corecrypto/cc_config.h>
#include <stdint.h>

// For more info on mach_absolute_time() precision:
// https://developer.apple.com/library/mac/qa/qa1398/_index.html

#if CC_USE_L4
#include <ert/time.h>
#define cc_absolute_time() ert_time_now()

// L4 doesn't use a scaling factor
#define cc_absolute_time_sf() (1.0 / 1000000000.0)
#elif CC_KERNEL
#include <mach/mach_time.h>
#include <kern/clock.h>
#define cc_absolute_time() (mach_absolute_time())

// Scale factor to convert absolute time to seconds
#define cc_absolute_time_sf() ({ \
    struct mach_timebase_info info; \
    clock_timebase_info(&info); \
    ((double)info.numer) / (1000000000.0 * info.denom); \
})
#elif CC_DARWIN
#include <mach/mach_time.h>
#define cc_absolute_time() (mach_absolute_time())

// Scale factor to convert absolute time to seconds
#define cc_absolute_time_sf() ({ \
    struct mach_timebase_info info; \
    mach_timebase_info(&info); \
    ((double)info.numer) / (1000000000.0 * info.denom); \
})
#elif defined(_WIN32)
#include <windows.h>
CC_INLINE uint64_t cc_absolute_time(void) {
    LARGE_INTEGER time;
    QueryPerformanceCounter(&time); // resolution < 1us
    return (uint64_t)time.QuadPart;
}

CC_INLINE double cc_absolute_time_sf(void) {
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq); // performance counter frequency, in Hz
    return (double)1 / freq.QuadPart;
}

#elif CC_LINUX
#if CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#include <mach/mach_time.h>
#define cc_absolute_time() (mach_absolute_time()) // To test compilation on mac
#else
// The following is specific to non-x86 (arm/mips/etc...) architectures on Linux.
#warning cc_absolute_time() has not been tested
#include <time.h>
#define NSEC_PER_USEC 1000ull
CC_INLINE uint64_t cc_absolute_time(void) {
    struct timespec tm;
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tm);
    return tm.tv_sec * 1000000000ull + tm.tv_nsec;
}
#endif // CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#define cc_absolute_time_sf() (1.0 / 1000000000.0)

#else
#warning Target OS is not defined. There should be a definition for cc_absolute_time() for the target OS/platform.
#endif

#endif /* cc_absolute_time_h */
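/* Usage sketch: the raw tick count becomes seconds via the scale factor.
   (Illustrative; do_work() is a placeholder.)

   uint64_t t0 = cc_absolute_time();
   do_work();
   uint64_t t1 = cc_absolute_time();
   double seconds = (double)(t1 - t0) * cc_absolute_time_sf();
*/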
@ -0,0 +1,600 @@
/* Copyright (c) (2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_CC_CONFIG_H_
#define _CORECRYPTO_CC_CONFIG_H_

/* A word about configuration macros:

    Conditional configuration macros specific to corecrypto should be named CORECRYPTO_xxx
    or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an
    #ifndef #error construct at the end of this file to make sure it's always defined.

    They should always be tested using the #if directive, never the #ifdef directive.

    No other conditional macros shall ever be used (except in this file).

    Configuration macros that are defined outside of corecrypto (eg: KERNEL, DEBUG, ...)
    shall only be used in this file to define CCxxx macros.

    External macros should be assumed to be either undefined, defined with no value,
    or defined as true or false. We shall strive to build with -Wundef whenever possible,
    so the following construct should be used to test external macros in this file:

         #if defined(DEBUG) && (DEBUG)
         #define CORECRYPTO_DEBUG 1
         #else
         #define CORECRYPTO_DEBUG 0
         #endif

    It is acceptable to define a conditional CC_xxxx macro in an implementation file,
    to be used only in that file.

    The current code is not guaranteed to follow those rules, but should be fixed to do so.

    corecrypto requires GNU and C99 compatibility,
    typically enabled by passing --gnu --c99 to the compiler (eg. armcc).
*/

// Do not set this macro to 1 unless you are developing/testing for Linux under macOS
#define CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT 0

// Do not set these macros to 1 unless you are developing/testing for Windows under macOS
#define CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT 0
#define CORECRYPTO_HACK_FOR_WINDOWS_DEVELOPMENT 0

#if (defined(DEBUG) && (DEBUG)) || defined(_DEBUG) // MSVC defines _DEBUG
/* CC_DEBUG is already used in CommonCrypto */
#define CORECRYPTO_DEBUG 1
#else
#define CORECRYPTO_DEBUG 0
#endif

// This macro can be used to enable prints when a condition in the macro "cc_require"
// is false. This is especially useful to confirm that negative testing fails
// at the intended location.
#define CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS 0

#if defined(KERNEL) && (KERNEL)
#define CC_KERNEL 1 // KEXT, XNU repo or kernel components such as AppleKeyStore
#else
#define CC_KERNEL 0
#endif

#if defined(__linux__) || CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#define CC_LINUX 1
#else
#define CC_LINUX 0
#endif

#if defined(__ANDROID__) && (__ANDROID__)
#define CC_ANDROID 1
#else
#define CC_ANDROID 0
#endif

#if defined(USE_L4) && (USE_L4)
#define CC_USE_L4 1
#else
#define CC_USE_L4 0
#endif

#if defined(RTKIT) && (RTKIT)
#define CC_RTKIT 1
#else
#define CC_RTKIT 0
#endif

#if defined(RTKITROM) && (RTKITROM)
#define CC_RTKITROM 1
#else
#define CC_RTKITROM 0
#endif

#if defined(USE_SEPROM) && (USE_SEPROM)
#define CC_USE_SEPROM 1
#else
#define CC_USE_SEPROM 0
#endif

#if defined(USE_S3) && (USE_S3)
#define CC_USE_S3 1
#else
#define CC_USE_S3 0
#endif

#if (defined(ICE_FEATURES_ENABLED)) || (defined(MAVERICK) && (MAVERICK))
#define CC_BASEBAND 1
#else
#define CC_BASEBAND 0
#endif

#if defined(EFI) && (EFI)
#define CC_EFI 1
#else
#define CC_EFI 0
#endif

#if defined(IBOOT) && (IBOOT)
#define CC_IBOOT 1
#else
#define CC_IBOOT 0
#endif

#if defined(TARGET_OS_BRIDGE)
#define CC_BRIDGE TARGET_OS_BRIDGE
#else
#define CC_BRIDGE 0
#endif

// Check if we're running on a generic, userspace platform, i.e., not in the kernel, SEP, etc.
#ifndef CC_GENERIC_PLATFORM
#define CC_GENERIC_PLATFORM \
    (!CC_RTKIT && !CC_KERNEL && !CC_USE_L4 && \
     !CC_RTKITROM && !CC_EFI && !CC_IBOOT && \
     !CC_USE_SEPROM && !CC_ANDROID && !CC_LINUX && \
     !CC_BRIDGE)
#endif

// Defined by the XNU build scripts
// Applies to code embedded in XNU but NOT to the kext
#if defined(XNU_KERNEL_PRIVATE)
#define CC_XNU_KERNEL_PRIVATE 1
#else
#define CC_XNU_KERNEL_PRIVATE 0
#endif

// Handle unaligned data, if the cpu cannot. Currently for the Gladman AES and the C version of SHA256.
#define CC_HANDLE_UNALIGNED_DATA CC_BASEBAND

// BaseBand configuration
#if CC_BASEBAND

// -- ENDIANNESS
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
#if defined(ENDIAN_LITTLE) || (defined(__arm__) && !defined(__BIG_ENDIAN))
#define __LITTLE_ENDIAN__
#elif !defined(ENDIAN_BIG) && !defined(__BIG_ENDIAN)
#error Baseband endianness not defined.
#endif
#define AESOPT_ENDIAN_NO_FILE
#endif

// -- Architecture
#define CCN_UNIT_SIZE 4 // 32 bits

// -- External function
#define assert ASSERT // sanity

// -- Warnings
// Ignore irrelevant warnings after verification
// #1254-D: arithmetic on pointer to void or function type
// #186-D: pointless comparison of unsigned integer with zero
// #546-D: transfer of control bypasses initialization of
#ifdef __arm__
#pragma diag_suppress 186, 1254, 546
#elif defined(__GNUC__)
// warning: pointer of type 'void *' used in arithmetic
#pragma GCC diagnostic ignored "-Wpointer-arith"
#endif // __arm__
#define CC_SMALL_CODE 1

#endif // CC_BASEBAND

#if CC_RTKIT || CC_RTKITROM
#define CC_SMALL_CODE 1
#endif

#ifndef CC_SMALL_CODE
#define CC_SMALL_CODE 0
#endif

// CC_DARWIN indicates the availability of XNU kernel functions,
// as on macOS, iOS, tvOS and watchOS.
#if (CC_USE_L4 || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_EFI || CC_LINUX || defined(_WIN32) || CC_BASEBAND || CC_USE_S3 || CC_ANDROID)
#define CC_DARWIN 0
#else
#define CC_DARWIN 1
#endif

// arm64 definition for gcc
#if defined(__GNUC__) && defined(__aarch64__) && !defined(__arm64__)
#define __arm64__
#endif

#if !defined(CCN_UNIT_SIZE)
#if defined(__arm64__) || defined(__x86_64__) || defined(_WIN64)
#define CCN_UNIT_SIZE 8
#elif defined(__arm__) || defined(__i386__) || defined(_WIN32)
#define CCN_UNIT_SIZE 4
#else
#error undefined architecture
#endif
#endif /* !defined(CCN_UNIT_SIZE) */

// This allows corecrypto Windows development using Xcode.
#if defined(CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT)
#if CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT && CC_DARWIN && CORECRYPTO_DEBUG
#define CC_USE_ASM 0
#define CC_USE_HEAP_FOR_WORKSPACE 1
#if (CCN_UNIT_SIZE==8)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0
#else
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1
#endif
#endif
#endif

#if !defined(CCN_UINT128_SUPPORT_FOR_64BIT_ARCH)
#if defined(_WIN64) && defined(_WIN32) && (CCN_UNIT_SIZE==8)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0
#elif defined(_WIN32)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1 // should not be a problem
#else
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1
#endif
#endif

#if defined(_MSC_VER)
#if defined(__clang__)
#define CC_ALIGNED(x) __attribute__ ((aligned(x))) // clang compiler
#else
#define CC_ALIGNED(x) __declspec(align(x)) // MS compiler
#endif
#else
#if __clang__ || CCN_UNIT_SIZE==8
#define CC_ALIGNED(x) __attribute__ ((aligned(x)))
#else
#define CC_ALIGNED(x) __attribute__ ((aligned((x)>8?8:(x))))
#endif
#endif
#if defined(__arm__)
// This is copied from <arm/arch.h>, because <arm/arch.h> is not available in the SEPROM environment.
#if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__) || defined(__ARM_ARCH_7EM__)
#define _ARM_ARCH_7
#endif

#if defined(__ARM_ARCH_6M__) || defined(__TARGET_ARCH_6S_M) || defined (__armv6m__)
#define _ARM_ARCH_6M
#endif
#endif

#if defined(__arm64__) || defined(__arm__)
#define CCN_IOS 1
#define CCN_OSX 0
#elif defined(__x86_64__) || defined(__i386__)
#define CCN_IOS 0
#define CCN_OSX 1
#endif

#if CC_USE_S3
/* For the corecrypto kext, CC_STATIC should be undefined */
#define CC_STATIC 1
#endif

#if !defined(CC_USE_HEAP_FOR_WORKSPACE)
#if CC_USE_S3 || CC_USE_SEPROM || CC_RTKITROM
#define CC_USE_HEAP_FOR_WORKSPACE 0
#else
#define CC_USE_HEAP_FOR_WORKSPACE 1
#endif
#endif

/* memset_s is only available on a few targets */
#if CC_USE_SEPROM || defined(__CC_ARM) \
    || defined(__hexagon__) || CC_EFI
#define CC_HAS_MEMSET_S 0
#else
#define CC_HAS_MEMSET_S 1
#endif

// Include target conditionals if available.
#if defined(__has_include) /* portability */
#if __has_include(<TargetConditionals.h>)
#include <TargetConditionals.h>
#endif /* __has_include(<TargetConditionals.h>) */
#endif /* defined(__has_include) */

// Disable RSA keygen on iBridge
#if defined(TARGET_OS_BRIDGE) && TARGET_OS_BRIDGE && CC_KERNEL
#define CC_DISABLE_RSAKEYGEN 1 /* for iBridge */
#else
#define CC_DISABLE_RSAKEYGEN 0 /* default */
#endif

#if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__))
#define CCEC25519_CURVE25519_64BIT 1
#else
#define CCEC25519_CURVE25519_64BIT 0
#endif

//- functions implemented in assembly ------------------------------------------
// This is the list of corecrypto clients that use assembly and the clang compiler.
#if !(CC_DARWIN || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG
#warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform"
#endif

// Enable assembler on Linux if CC_LINUX_ASM is defined
#if CC_LINUX && defined(CC_LINUX_ASM) && CC_LINUX_ASM
#define CC_USE_ASM 1
#endif

// Use this macro to strictly disable assembly regardless of cpu/os/compiler/etc.
// Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well.
#if !defined(CC_USE_ASM)
#if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_LINUX
#define CC_USE_ASM 0
#else
#define CC_USE_ASM 1
#endif
#endif

#define CC_CACHE_DESCRIPTORS CC_KERNEL

//-(1) ARM V7
#if defined(_ARM_ARCH_7) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR      CC_SMALL_CODE
#define CCN_MUL_KARATSUBA      0 // no performance improvement
#define CCN_ADD_ASM            1
#define CCN_SUB_ASM            1
#define CCN_MUL_ASM            0
#define CCN_ADDMUL1_ASM        1
#define CCN_MUL1_ASM           1
#define CCN_CMP_ASM            1
#define CCN_ADD1_ASM           1
#define CCN_SUB1_ASM           1
#define CCN_N_ASM              1
#define CCN_SET_ASM            1
#define CCN_SHIFT_RIGHT_ASM    1
#if defined(__ARM_NEON__)
#define CCN_SHIFT_LEFT_ASM     1
#else
#define CCN_SHIFT_LEFT_ASM     0
#endif
#define CCN_MULMOD_224_ASM     1
#define CCN_MULMOD_256_ASM     1
#define CCAES_ARM_ASM          1
#define CCAES_INTEL_ASM        0
#if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3
#define CCAES_MUX              0
#else
#define CCAES_MUX              1
#endif
#define CCN_USE_BUILTIN_CLZ    1
#define CCSHA1_VNG_INTEL       0
#define CCSHA2_VNG_INTEL       0

#if defined(__ARM_NEON__) || CC_KERNEL
#define CCSHA1_VNG_ARM         1
#define CCSHA2_VNG_ARM         1
#else /* !defined(__ARM_NEON__) */
#define CCSHA1_VNG_ARM         0
#define CCSHA2_VNG_ARM         0
#endif /* !defined(__ARM_NEON__) */
#define CCSHA256_ARMV6M_ASM    0

#define CC_ACCELERATECRYPTO    1

//-(2) ARM 64
#elif defined(__arm64__) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR      CC_SMALL_CODE
#define CCN_MUL_KARATSUBA      0 // 4*n CCN_UNIT extra memory required.
#define CCN_ADD_ASM            1
#define CCN_SUB_ASM            1
#define CCN_MUL_ASM            1
#define CCN_ADDMUL1_ASM        0
#define CCN_MUL1_ASM           0
#define CCN_CMP_ASM            1
#define CCN_ADD1_ASM           0
#define CCN_SUB1_ASM           0
#define CCN_N_ASM              1
#define CCN_SET_ASM            0
#define CCN_SHIFT_RIGHT_ASM    1
#define CCN_SHIFT_LEFT_ASM     1
#define CCN_MULMOD_224_ASM     1
#define CCN_MULMOD_256_ASM     1
#define CCAES_ARM_ASM          1
#define CCAES_INTEL_ASM        0
#define CCAES_MUX              0 // On 64-bit SoCs, asm is much faster than HW
#define CCN_USE_BUILTIN_CLZ    1
#define CCSHA1_VNG_INTEL       0
#define CCSHA2_VNG_INTEL       0
#define CCSHA1_VNG_ARM         1
#define CCSHA2_VNG_ARM         1
#define CCSHA256_ARMV6M_ASM    0

#define CC_ACCELERATECRYPTO    1

//-(3) Intel 32/64
#elif (defined(__x86_64__) || defined(__i386__)) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR      1
#define CCN_MUL_KARATSUBA      0 // 4*n CCN_UNIT extra memory required.
/* These assembly routines only work for a single CCN_UNIT_SIZE. */
#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) || (defined(__i386__) && CCN_UNIT_SIZE == 4)
#define CCN_ADD_ASM            1
#define CCN_SUB_ASM            1
#define CCN_MUL_ASM            1
#else
#define CCN_ADD_ASM            0
#define CCN_SUB_ASM            0
#define CCN_MUL_ASM            0
#endif

#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8)
#define CCN_CMP_ASM            1
#define CCN_N_ASM              1
#define CCN_SHIFT_RIGHT_ASM    1
#define CCN_SHIFT_LEFT_ASM     1
#else
#define CCN_CMP_ASM            0
#define CCN_N_ASM              0
#define CCN_SHIFT_RIGHT_ASM    0
#define CCN_SHIFT_LEFT_ASM     0
#endif

#define CCN_MULMOD_224_ASM     0
#if defined(__x86_64__) && CCN_UNIT_SIZE == 8
#define CCN_MULMOD_256_ASM     1
#define CCN_ADDMUL1_ASM        1
#define CCN_MUL1_ASM           1
#else
#define CCN_MULMOD_256_ASM     0
#define CCN_ADDMUL1_ASM        0
#define CCN_MUL1_ASM           0
#endif
#define CCN_ADD1_ASM           0
#define CCN_SUB1_ASM           0
#define CCN_SET_ASM            0
#define CCAES_ARM_ASM          0
#define CCAES_INTEL_ASM        1
#define CCAES_MUX              0
#define CCN_USE_BUILTIN_CLZ    0
#define CCSHA1_VNG_INTEL       1
#define CCSHA2_VNG_INTEL       1
#define CCSHA1_VNG_ARM         0
#define CCSHA2_VNG_ARM         0
#define CCSHA256_ARMV6M_ASM    0

#define CC_ACCELERATECRYPTO    1

//-(4) disable assembly
#else
#if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH
#define CCN_DEDICATED_SQR      1
#else
#define CCN_DEDICATED_SQR      0 // when assembly is off and 128-bit integers are not supported, dedicated square is off. This is the case on Windows.
#endif
#define CCN_MUL_KARATSUBA      0 // 4*n CCN_UNIT extra memory required.
#define CCN_ADD_ASM            0
#define CCN_SUB_ASM            0
#define CCN_MUL_ASM            0
#define CCN_ADDMUL1_ASM        0
#define CCN_MUL1_ASM           0
#define CCN_CMP_ASM            0
#define CCN_ADD1_ASM           0
#define CCN_SUB1_ASM           0
#define CCN_N_ASM              0
#define CCN_SET_ASM            0
#define CCN_SHIFT_RIGHT_ASM    0
#define CCN_SHIFT_LEFT_ASM     0
#define CCN_MULMOD_224_ASM     0
#define CCN_MULMOD_256_ASM     0
#define CCAES_ARM_ASM          0
#define CCAES_INTEL_ASM        0
#define CCAES_MUX              0
#define CCN_USE_BUILTIN_CLZ    0
#define CCSHA1_VNG_INTEL       0
#define CCSHA2_VNG_INTEL       0
#define CCSHA1_VNG_ARM         0
#define CCSHA2_VNG_ARM         0
#define CCSHA256_ARMV6M_ASM    0

#define CC_ACCELERATECRYPTO    0

#endif

#define CC_INLINE static inline

#ifdef __GNUC__
#define CC_NORETURN __attribute__((__noreturn__))
#define CC_NOTHROW __attribute__((__nothrow__))
#define CC_NONNULL(N) __attribute__((__nonnull__ N))
#define CC_NONNULL4 CC_NONNULL((4))
#define CC_NONNULL_ALL __attribute__((__nonnull__))
#define CC_SENTINEL __attribute__((__sentinel__))
// Only apply the `CC_CONST` attribute to functions with no side effects whose output is a strict function of their pass-by-value inputs.
// Specifically, do not apply CC_CONST if the function has any arguments that are pointers (directly or indirectly).
#define CC_CONST __attribute__((__const__))
#define CC_PURE __attribute__((__pure__))
#define CC_WARN_RESULT __attribute__((__warn_unused_result__))
#define CC_MALLOC_CLEAR __attribute__((__malloc__))
#define CC_UNUSED __attribute__((unused))
#else /* !__GNUC__ */
/*! @parseOnly */
#define CC_UNUSED
/*! @parseOnly */
#define CC_NONNULL(N)
/*! @parseOnly */
#define CC_NONNULL4
/*! @parseOnly */
#define CC_NORETURN
/*! @parseOnly */
#define CC_NOTHROW
/*! @parseOnly */
#define CC_NONNULL_ALL
/*! @parseOnly */
#define CC_SENTINEL
/*! @parseOnly */
#define CC_CONST
/*! @parseOnly */
#define CC_PURE
/*! @parseOnly */
#define CC_WARN_RESULT
/*! @parseOnly */
#define CC_MALLOC_CLEAR
#endif /* !__GNUC__ */

// Bridge differences between Mach-O and ELF compilers/assemblers.
#if CC_LINUX
#define CC_ASM_SECTION_CONST .rodata
#define CC_ASM_PRIVATE_EXTERN .hidden
// We need to be sure that the assembler can access relocated C
// symbols. Sad, but this is the quickest way to do that, at least with
// our current Linux compiler (clang-3.4).
#define CC_C_LABEL(_sym) _sym@PLT
#define _IMM(x) $(x)
#else /* !CC_LINUX */
#define CC_ASM_SECTION_CONST .const
#define CC_ASM_PRIVATE_EXTERN .private_extern
#define CC_C_LABEL(_sym) _##_sym
#define _IMM(x) $$(x)
#endif /* !CC_LINUX */

// Enable FIPSPOST function tracing only when supported.
#ifdef CORECRYPTO_POST_TRACE
#define CC_FIPSPOST_TRACE 1
#else
#define CC_FIPSPOST_TRACE 0
#endif

#ifndef CC_INTERNAL_SDK
#if __has_include(<System/i386/cpu_capabilities.h>)
#define CC_INTERNAL_SDK 1
#elif __has_include(<System/arm/cpu_capabilities.h>)
#define CC_INTERNAL_SDK 1
#else
#define CC_INTERNAL_SDK 0
#endif
#endif

// Currently, thread sanitizer is only supported in local builds.
// Please edit your "corecrypto_test" scheme to build with thread
// sanitizer and then remove *all* variants of corecrypto_static
// besides "normal".
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define CC_TSAN 1
#else
#define CC_TSAN 0
#endif // __has_feature(thread_sanitizer)
#else
#define CC_TSAN 0
#endif // __has_feature

#endif /* _CORECRYPTO_CC_CONFIG_H_ */
@ -0,0 +1,76 @@
/* Copyright (c) (2012,2014-2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

// debug configuration header file
#ifndef _CORECRYPTO_CCN_DEBUG_H_
#define _CORECRYPTO_CCN_DEBUG_H_

#include <corecrypto/cc_config.h>

// DO NOT INCLUDE this HEADER file in corecrypto files added to the XNU project,
// or in headers included by external clients.

// ========================
// Printf for corecrypto
// ========================
#if CC_KERNEL
#include <pexpert/pexpert.h>
#define cc_printf(x...) kprintf(x)
#if !CONFIG_EMBEDDED
extern int printf(const char *format, ...) __printflike(1,2);
#endif
#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT || CC_RTKITROM
#include <stdio.h>
#define cc_printf(x...) printf(x)
#elif defined(__ANDROID_API__)
#include <android/log.h>
#define cc_printf(x...) __android_log_print(ANDROID_LOG_DEBUG, "corecrypto", x);
#else
#include <stdio.h>
#define cc_printf(x...) fprintf(stderr, x)
#endif

// ========================
// Integer types
// ========================

#if CC_KERNEL
/* These are not defined in libkern */
#define PRIx64 "llx"
#define PRIx32 "x"
#define PRIx16 "hx"
#define PRIx8 "hhx"
#else
#include <inttypes.h>
#endif

#if CCN_UNIT_SIZE == 8
#define CCPRIx_UNIT ".016" PRIx64
#elif CCN_UNIT_SIZE == 4
#define CCPRIx_UNIT ".08" PRIx32
#elif CCN_UNIT_SIZE == 2
#define CCPRIx_UNIT ".04" PRIx16
#elif CCN_UNIT_SIZE == 1
#define CCPRIx_UNIT ".02" PRIx8
#else
#error invalid CCN_UNIT_SIZE
#endif
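/* Usage sketch (assuming CCN_UNIT_SIZE == 8, so CCPRIx_UNIT expands to
   ".016" PRIx64): prints one 64-bit limb as 16 zero-padded hex digits.

   uint64_t limb = 0x428a2f98d728ae22;
   cc_printf("limb = %" CCPRIx_UNIT "\n", limb);
*/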
// ========================
// Print utilities for corecrypto
// ========================

#include <corecrypto/cc.h>

/* Print a byte array of arbitrary size */
void cc_print(const char *label, size_t count, const uint8_t *s);

#endif /* _CORECRYPTO_CCN_DEBUG_H_ */
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
/* Copyright (c) (2017,2018,2019,2020) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#ifndef _CORECRYPTO_CC_ERROR_H_
|
||||
#define _CORECRYPTO_CC_ERROR_H_
|
||||
|
||||
enum {
|
||||
CCERR_OK = 0,
|
||||
|
||||
/* the default error code */
|
||||
CCERR_INTERNAL = -1,
|
||||
|
||||
CCERR_INTEGRITY = -2,
|
||||
|
||||
CCERR_DEVICE = -3,
|
||||
CCERR_INTERRUPTS = -4,
|
||||
CCERR_CRYPTO_CONFIG = -5,
|
||||
CCERR_PERMS = -6,
|
||||
CCERR_PARAMETER = -7,
|
||||
CCERR_MEMORY = -8,
|
||||
CCERR_FILEDESC = -9,
|
||||
CCERR_OUT_OF_ENTROPY = -10,
|
||||
CCERR_ATFORK = -11,
|
||||
CCERR_OVERFLOW = -12,
|
||||
|
||||
CCERR_MEMORY_ALLOC_FAIL = -13,
|
||||
|
||||
CCEC_GENERATE_KEY_DEFAULT_ERR = -14,
|
||||
CCEC_GENERATE_KEY_TOO_MANY_TRIES = -15,
|
||||
CCEC_GENERATE_KEY_MULT_FAIL = -16,
|
||||
CCEC_GENERATE_KEY_AFF_FAIL = -17,
|
||||
CCEC_GENERATE_KEY_CONSISTENCY = -18,
|
||||
CCEC_GENERATE_NOT_ON_CURVE = -19,
|
||||
CCEC_GENERATE_NOT_ENOUGH_ENTROPY = -20,
|
||||
CCEC_GENERATE_NOT_SUPPORTED = -21,
|
||||
CCEC_GENERATE_INVALID_INPUT = -22,
|
||||
|
||||
// Program error: buffer too small or encrypted message is too small
|
||||
CCRSA_INVALID_INPUT = -23,
|
||||
// Invalid crypto configuration: Hash length versus RSA key size
|
||||
CCRSA_INVALID_CONFIG = -24,
|
||||
CCRSA_ENCODING_ERROR = -25,
|
||||
CCRSA_DECODING_ERROR = -26,
|
||||
|
||||
// The data is invalid (we won't say more for security)
|
||||
CCRSA_PRIVATE_OP_ERROR = -27,
|
||||
CCRSA_KEY_ERROR = -28,
|
||||
|
||||
// Key generation specific
|
||||
CCRSA_KEYGEN_PRIME_NOT_FOUND = -29,
|
||||
CCRSA_KEYGEN_PRIME_NEED_NEW_SEED = -30,
|
||||
CCRSA_KEYGEN_PRIME_TOO_MANY_ITERATIONS = -31,
|
||||
CCRSA_KEYGEN_PRIME_SEED_GENERATION_ERROR = -32,
|
||||
CCRSA_KEYGEN_MODULUS_CRT_INV_ERROR = -33,
|
||||
CCRSA_KEYGEN_NEXT_PRIME_ERROR = -34,
|
||||
CCRSA_KEYGEN_SEED_X_ERROR = -35,
|
||||
CCRSA_KEYGEN_SEED_r_ERROR = -36,
|
||||
CCRSA_KEYGEN_KEYGEN_CONSISTENCY_FAIL = -37,
|
||||
CCRSA_KEYGEN_R1R2_SIZE_ERROR = -38,
|
||||
CCRSA_KEYGEN_PQ_DELTA_ERROR = -39,
|
||||
|
||||
CCRSA_FIPS_KEYGEN_DISABLED = -40,
|
||||
|
||||
CCZP_INV_ERROR = -41,
|
||||
CCZP_INV_NO_INVERSE = -42,
|
||||
CCZP_INV_INVALID_INPUT = -43,
|
||||
|
||||
CCZ_INVALID_INPUT_ERROR = -44,
|
||||
CCZ_INVALID_RADIX_ERROR = -45,
|
||||
|
||||
CCDH_ERROR_DEFAULT = -46,
|
||||
CCDH_GENERATE_KEY_TOO_MANY_TRIES = -47,
|
||||
CCDH_NOT_SUPPORTED_CONFIGURATION = -48,
|
||||
CCDH_SAFETY_CHECK = -49,
|
||||
CCDH_PUBLIC_KEY_MISSING = -50,
|
||||
CCDH_INVALID_DOMAIN_PARAMETER = -51,
|
||||
CCDH_INVALID_INPUT = -52,
|
||||
CCDH_DOMAIN_PARAMETER_MISMATCH = -53,
|
||||
CCDH_GENERATE_KEY_CONSISTENCY = -54,
|
||||
|
||||
CCSRP_ERROR_DEFAULT = -55,
|
||||
CCSRP_GENERATE_KEY_TOO_MANY_TRIES = -56,
|
||||
CCSRP_NOT_SUPPORTED_CONFIGURATION = -57,
|
||||
CCSRP_SAFETY_CHECK = -58,
|
||||
CCSRP_PUBLIC_KEY_MISSING = -59,
|
||||
CCSRP_INVALID_DOMAIN_PARAMETER = -60,
|
||||
|
||||
CCDRBG_STATUS_ERROR = -61,
|
||||
CCDRBG_STATUS_NEED_RESEED = -62,
|
||||
CCDRBG_STATUS_PARAM_ERROR = -63,
|
||||
// If this value is returned, the caller must abort or panic the process for
|
||||
// security reasons, for example in the case of a catastrophic error as described in
|
||||
// http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf
|
||||
// ccdrbg calls abort() or panic(), if they are available in the system.
|
||||
CCDRBG_STATUS_ABORT = -64,
|
||||
|
||||
CCKPRNG_NEED_ENTROPY = -65,
|
||||
CCKPRNG_ABORT = -66,
|
||||
|
||||
CCMODE_INVALID_INPUT = -67,
|
||||
CCMODE_INVALID_CALL_SEQUENCE = -68,
|
||||
CCMODE_INTEGRITY_FAILURE = -69,
|
||||
CCMODE_NOT_SUPPORTED = -70,
|
||||
CCMODE_INTERNAL_ERROR = -71,
|
||||
|
||||
// Configuration or unexpected issue
|
||||
CCPOST_GENERIC_FAILURE = -72,
|
||||
CCPOST_LIBRARY_ERROR = -73,
|
||||
CCPOST_INTEGRITY_ERROR = -74,
|
||||
// Output of the algo is not as expected
|
||||
CCPOST_KAT_FAILURE = -75,
|
||||
|
||||
CCKPRNG_SEEDFILE_OPEN = -76,
|
||||
CCKPRNG_SEEDFILE_READ = -78,
|
||||
CCKPRNG_SEEDFILE_WRITE = -79,
|
||||
CCKPRNG_SEEDFILE_CHMOD = -80,
|
||||
CCKPRNG_SEEDFILE_CHOWN = -81,
|
||||
CCKPRNG_RANDOMDEV_OPEN = -82,
|
||||
CCKPRNG_RANDOMDEV_WRITE = -83,
|
||||
CCKPRNG_GETENTROPY = -84,
|
||||
|
||||
CCSAE_HUNTPECK_EXCEEDED_MAX_TRIALS = -85,
|
||||
|
||||
CCERR_CALL_SEQUENCE = -86,
|
||||
|
||||
CCVRF_POINT_DECODE_FAILURE = -87,
|
||||
CCVRF_POINT_INVALID_PUBLIC_KEY = -88,
|
||||
CCVRF_VERIFY_FAILURE = -89,
|
||||
|
||||
// Error codes for Authenticated Encryption Modes
|
||||
CCMODE_TAG_LENGTH_REQUEST_TOO_LONG = -100,
|
||||
CCMODE_TAG_LENGTH_TOO_SHORT = -101,
|
||||
CCMODE_NONCE_EMPTY = -102,
|
||||
CCMODE_AD_EMPTY = -103,
|
||||
CCMODE_DECRYPTION_OR_VERIFICATION_ERR=-104,
|
||||
CCMODE_BUFFER_OUT_IN_OVERLAP = -105,
|
||||
|
||||
CCSAE_NOT_ENOUGH_COMMIT_PARTIAL_CALLS = -132,
|
||||
CCSAE_GENERATE_COMMIT_CALL_AGAIN = -133,
|
||||
|
||||
CCERR_VALID_SIGNATURE = CCERR_OK,
|
||||
CCERR_INVALID_SIGNATURE = -146,
|
||||
|
||||
CCERR_IOSERVICE_GETMATCHING = -147,
|
||||
CCERR_IOSERVICE_OPEN = -148,
|
||||
CCERR_IOCONNECT_CALL = -149,
|
||||
|
||||
CCEC_KEY_CANNOT_BE_UNIT = -160,
|
||||
CCEC_COMPRESSED_POINT_ENCODING_ERROR = -161,
|
||||
|
||||
CCERR_RNG_NOT_SEEDED = -162,
|
||||
};
|
||||
|
||||
#define CCDRBG_STATUS_OK CCERR_OK
|
||||
#define CCKPRNG_OK CCERR_OK
|
||||
|
||||
#endif /* _CORECRYPTO_CC_ERROR_H_ */
|
||||
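/*
 * Usage sketch (editor's illustration, not part of corecrypto; the callback
 * is hypothetical): corecrypto functions return CCERR_OK (0) on success and
 * one of the negative codes above on failure, so callers propagate the code
 * unchanged.
 */
static int generate_key_example(int (*generate)(void))
{
    int rv = generate();
    if (rv != CCERR_OK) {
        return rv; /* e.g. CCEC_GENERATE_NOT_ENOUGH_ENTROPY */
    }
    return CCERR_OK;
}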
|
|
@ -0,0 +1,29 @@
|
|||
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#ifndef corecrypto_cc_fault_canary_h
|
||||
#define corecrypto_cc_fault_canary_h
|
||||
|
||||
#include "cc.h"
|
||||
|
||||
#define CC_FAULT_CANARY_SIZE 16
|
||||
typedef uint8_t cc_fault_canary_t[CC_FAULT_CANARY_SIZE];
|
||||
|
||||
extern const cc_fault_canary_t CCEC_FAULT_CANARY;
|
||||
extern const cc_fault_canary_t CCRSA_PKCS1_FAULT_CANARY;
|
||||
extern const cc_fault_canary_t CCRSA_PSS_FAULT_CANARY;
|
||||
|
||||
#define CC_FAULT_CANARY_MEMCPY(_dst_, _src_) memcpy(_dst_, _src_, CC_FAULT_CANARY_SIZE)
|
||||
#define CC_FAULT_CANARY_CLEAR(_name_) memset(_name_, 0x00, CC_FAULT_CANARY_SIZE)
|
||||
|
||||
#define CC_FAULT_CANARY_EQUAL(_a_, _b_) (cc_cmp_safe(CC_FAULT_CANARY_SIZE, _a_, _b_) == 0)
|
||||
|
||||
#endif /* corecrypto_cc_fault_canary_h */
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#ifndef corecrypto_cc_fault_canary_internal_h
|
||||
#define corecrypto_cc_fault_canary_internal_h
|
||||
|
||||
/*!
|
||||
@function cc_fault_canary_set
|
||||
@abstract Set the output `fault_canary_out` to the value `fault_canary` if the two inputs are equal.
|
||||
|
||||
@param fault_canary_out Output fault canary value
|
||||
@param fault_canary Fault canary for a specific operation (e.g. CCEC_FAULT_CANARY for ECC signing)
|
||||
@param nbytes Byte length of inputs in1 and in2
|
||||
@param in1 Input one
|
||||
@param in2 Input two
|
||||
*/
|
||||
void cc_fault_canary_set(cc_fault_canary_t fault_canary_out, const cc_fault_canary_t fault_canary, size_t nbytes, const uint8_t *in1, const uint8_t *in2);
|
||||
|
||||
#endif /* corecrypto_cc_fault_canary_internal_h */
|
||||
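/*
 * Usage sketch (editor's illustration, not part of corecrypto; the buffer
 * names and the <stdbool.h>/<string.h> dependencies are assumptions): after
 * an RSA PKCS#1 verification recomputes the expected encoding em_prime, the
 * canary is produced only when em and em_prime match, and the result is then
 * compared in constant time.
 */
static bool pkcs1_canary_check_example(size_t em_len, const uint8_t *em, const uint8_t *em_prime)
{
    cc_fault_canary_t canary;
    CC_FAULT_CANARY_CLEAR(canary);
    cc_fault_canary_set(canary, CCRSA_PKCS1_FAULT_CANARY, em_len, em, em_prime);
    return CC_FAULT_CANARY_EQUAL(canary, CCRSA_PKCS1_FAULT_CANARY);
}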
|
|
@ -0,0 +1,16 @@
|
|||
/* Copyright (c) (2019) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <corecrypto/cc_priv.h>
|
||||
|
||||
extern bool cc_rdrand(uint64_t *rand);
|
||||
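/*
 * Usage sketch (editor's illustration, not part of corecrypto; the retry
 * bound and the CCERR_* codes from cc_error.h are assumptions): cc_rdrand
 * returns false when the RDRAND instruction fails to deliver a value, so
 * callers retry a bounded number of times.
 */
static int rdrand_example(uint64_t *out)
{
    for (int i = 0; i < 10; i++) { /* bounded retries */
        if (cc_rdrand(out)) {
            return CCERR_OK;
        }
    }
    return CCERR_OUT_OF_ENTROPY;
}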
|
|
@ -0,0 +1,150 @@
|
|||
/* Copyright (c) (2012,2015,2016,2017,2019) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#ifndef _CORECRYPTO_CC_MACROS_H_
|
||||
#define _CORECRYPTO_CC_MACROS_H_
|
||||
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
#ifndef __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING
|
||||
#define __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING ""
|
||||
#endif
|
||||
|
||||
#ifndef __CC_DEBUG_ASSERT_PRODUCTION_CODE
|
||||
#define __CC_DEBUG_ASSERT_PRODUCTION_CODE !CORECRYPTO_DEBUG
|
||||
#endif
|
||||
|
||||
#if CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS
|
||||
|
||||
#if !CC_KERNEL
|
||||
#include <string.h> // for strstr
|
||||
#endif // !CC_KERNEL
|
||||
|
||||
CC_UNUSED static char *cc_strstr(const char *file) {
|
||||
#if CC_KERNEL
|
||||
(void) file;
|
||||
#else
|
||||
const char cc_char []="corecrypto";
|
||||
char *p=strstr(file, cc_char);
|
||||
if (p) return (p+strlen(cc_char)+1);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define __CC_DEBUG_REQUIRE_MESSAGE(name, assertion, label, message, file, line, value) \
|
||||
{char *___t = cc_strstr(file); cc_printf( "require: %s, %s%s:%d\n", assertion, (message!=0) ? message : "", ___t==NULL?file:___t, line);}
|
||||
|
||||
#endif // CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS
|
||||
|
||||
#ifndef cc_require
|
||||
#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
|
||||
#if defined(_WIN32) && defined (__clang__)
|
||||
#define cc_require(assertion, exceptionLabel) \
|
||||
do { \
|
||||
if (!(assertion) ) { \
|
||||
goto exceptionLabel; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#else
|
||||
#define cc_require(assertion, exceptionLabel) \
|
||||
do { \
|
||||
if ( __builtin_expect(!(assertion), 0) ) { \
|
||||
goto exceptionLabel; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#endif
|
||||
#else
|
||||
#define cc_require(assertion, exceptionLabel) \
|
||||
do { \
|
||||
if ( __builtin_expect(!(assertion), 0) ) { \
|
||||
__CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
|
||||
#assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \
|
||||
goto exceptionLabel; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef cc_require_action
|
||||
#if __CC_DEBUG_ASSERT_PRODUCTION_CODE || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
|
||||
#if defined(_WIN32) && defined(__clang__)
|
||||
#define cc_require_action(assertion, exceptionLabel, action) \
|
||||
do \
|
||||
{ \
|
||||
if (!(assertion)) \
|
||||
{ \
|
||||
{ \
|
||||
action; \
|
||||
} \
|
||||
goto exceptionLabel; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#else
|
||||
#define cc_require_action(assertion, exceptionLabel, action) \
|
||||
do \
|
||||
{ \
|
||||
if ( __builtin_expect(!(assertion), 0) ) \
|
||||
{ \
|
||||
{ \
|
||||
action; \
|
||||
} \
|
||||
goto exceptionLabel; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#endif
|
||||
#else
|
||||
#define cc_require_action(assertion, exceptionLabel, action) \
|
||||
do \
|
||||
{ \
|
||||
if ( __builtin_expect(!(assertion), 0) ) \
|
||||
{ \
|
||||
__CC_DEBUG_REQUIRE_MESSAGE( \
|
||||
__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
|
||||
#assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \
|
||||
{ \
|
||||
action; \
|
||||
} \
|
||||
goto exceptionLabel; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef cc_require_or_return
|
||||
#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
|
||||
#if defined(_WIN32) && defined (__clang__)
|
||||
#define cc_require_or_return(assertion, value) \
|
||||
do { \
|
||||
if (!(assertion) ) { \
|
||||
return value; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#else
|
||||
#define cc_require_or_return(assertion, value) \
|
||||
do { \
|
||||
if ( __builtin_expect(!(assertion), 0) ) { \
|
||||
return value; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#endif
|
||||
#else
|
||||
#define cc_require_or_return(assertion, value) \
|
||||
do { \
|
||||
if ( __builtin_expect(!(assertion), 0) ) { \
|
||||
__CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
|
||||
#assertion, #value, 0, __FILE__, __LINE__, 0); \
|
||||
return value; \
|
||||
} \
|
||||
} while ( 0 )
|
||||
#endif
|
||||
#endif
|
||||
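/*
 * Usage sketch (editor's illustration, not part of corecrypto; assumes the
 * CCERR_* codes from cc_error.h): the cc_require* macros jump to a cleanup
 * label when an assertion fails; cc_require_action additionally runs an
 * action (here, setting the return code) first, and cc_require_or_return
 * returns a value directly.
 */
static int require_example(const uint8_t *in, size_t in_len)
{
    int rv = CCERR_OK;
    cc_require_or_return(in != NULL, CCERR_PARAMETER);
    cc_require_action(in_len >= 16, out, rv = CCERR_PARAMETER);
    /* ... work on in ... */
out:
    return rv;
}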
|
||||
#endif /* _CORECRYPTO_CC_MACROS_H_ */
|
||||
|
|
@ -0,0 +1,192 @@
|
|||
/* Copyright (c) (2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#include "cc_config.h"
|
||||
|
||||
#ifndef corecrypto_cc_memory_h
|
||||
#define corecrypto_cc_memory_h
|
||||
|
||||
#if CORECRYPTO_DEBUG && !defined(_WIN32) && !defined(_WIN64)
|
||||
#define CC_ALLOC_DEBUG 1
|
||||
#endif
|
||||
|
||||
struct ws_dbg {
|
||||
const void *p;
|
||||
const char *file;
|
||||
int line;
|
||||
const char *func;
|
||||
};
|
||||
|
||||
#if defined(CC_ALLOC_DEBUG)
|
||||
extern struct ws_dbg g_ws_dbg;
|
||||
#endif
|
||||
|
||||
#include <corecrypto/cc_config.h>
|
||||
#include <corecrypto/cc_error.h>
|
||||
#include "cc_debug.h"
|
||||
#include <corecrypto/cc_priv.h>
|
||||
|
||||
CC_INLINE void cc_alloc_debug(CC_UNUSED const void *p, CC_UNUSED const char *file, CC_UNUSED int line, CC_UNUSED const char *func)
|
||||
{
|
||||
#if defined(CC_ALLOC_DEBUG)
|
||||
// Contract with some clients is to have a single allocation live at a time
|
||||
cc_assert(g_ws_dbg.p == NULL);
|
||||
g_ws_dbg = (struct ws_dbg){ p, file, line, func };
|
||||
#endif
|
||||
}
|
||||
|
||||
CC_INLINE void cc_free_debug(CC_UNUSED const void *p)
|
||||
{
|
||||
#if defined(CC_ALLOC_DEBUG)
|
||||
// Contract with some clients is to have a single allocation live at a time
|
||||
cc_assert(g_ws_dbg.p == p); // Free the address we allocated
|
||||
g_ws_dbg = (struct ws_dbg){};
|
||||
#endif
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Declare workspace with memory in STACK
|
||||
// This is the least preferred option since most corecrypto clients have
|
||||
// small stacks. It is still useful for small allocations and when errors
|
||||
// can't be easily propagated.
|
||||
// =============================================================================
|
||||
|
||||
// Declare a variable in stack and use its address
|
||||
// Only use this when we don't have a way to propagate errors
|
||||
#define CC_DECL_WORKSPACE_STACK(ws, n) \
|
||||
cc_unit ws##_buf[(n)]; \
|
||||
cc_ws ws##_ctx = { &ws##_buf[0], &ws##_buf[(n)] }; \
|
||||
cc_ws_t ws = &ws##_ctx; \
|
||||
cc_alloc_debug(ws->start, __FILE__, __LINE__, __func__);
|
||||
|
||||
// Reset pointers to avoid future reference
|
||||
#define CC_FREE_WORKSPACE_STACK(ws) \
|
||||
cc_free_debug(ws->start); \
|
||||
ws->start = NULL; \
|
||||
ws->end = NULL;
|
||||
|
||||
#define CC_CLEAR_AND_FREE_WORKSPACE_STACK(ws) \
|
||||
cc_try_abort_if(ws->start > ws->end, "free ws"); \
|
||||
ccn_clear((cc_size)(ws->end - ws->start), ws->start); \
|
||||
CC_FREE_WORKSPACE_STACK(ws);
|
||||
|
||||
// =============================================================================
|
||||
// Declare workspace in the region corresponding to HEAP or STACK
|
||||
// depending on the setting of CC_USE_HEAP_FOR_WORKSPACE
|
||||
// This should be the preference for large memory allocations, but it requires
|
||||
// propagating errors in case of allocation failure.
|
||||
// =============================================================================
|
||||
#if CC_USE_HEAP_FOR_WORKSPACE
|
||||
|
||||
// Malloc/free functions to be used
|
||||
#if CC_KERNEL
|
||||
#include <IOKit/IOLib.h>
|
||||
#include <vm/pmap.h>
|
||||
CC_INLINE void *cc_malloc_clear(size_t s)
|
||||
{
|
||||
void *p = NULL;
|
||||
if (pmap_in_ppl()) {
|
||||
if (s > PAGE_SIZE) {
|
||||
panic("PPL cc_malloc_clear trying to allocate %zu > PAGE_SIZE", s);
|
||||
}
|
||||
|
||||
p = pmap_claim_reserved_ppl_page();
|
||||
} else {
|
||||
p = IOMalloc(s);
|
||||
}
|
||||
if (p != NULL) {
|
||||
memset(p, 0, s);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
CC_INLINE void cc_free(void *p, size_t size)
|
||||
{
|
||||
if (pmap_in_ppl()) {
|
||||
if (size > PAGE_SIZE) {
|
||||
panic("PPL cc_malloc_clear trying to free %zu > PAGE_SIZE", size);
|
||||
}
|
||||
|
||||
pmap_free_reserved_ppl_page(p);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
IOFree(p, size);
|
||||
}
|
||||
#else // !CC_KERNEL
|
||||
#include <stdlib.h>
|
||||
CC_INLINE void *cc_malloc_clear(size_t s)
|
||||
{
|
||||
void *p = malloc(s);
|
||||
if (p != NULL) {
|
||||
memset(p, 0, s);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
CC_INLINE void cc_free(void *p, size_t size CC_UNUSED)
|
||||
{
|
||||
free(p);
|
||||
}
|
||||
|
||||
#endif // !CC_KERNEL
|
||||
|
||||
#define CC_DECL_WORKSPACE_OR_FAIL(ws, n) \
|
||||
cc_unit *ws##_buf = (cc_unit *) cc_malloc_clear(ccn_sizeof_n((n))); \
|
||||
cc_ws ws##_ctx = { &ws##_buf[0], &ws##_buf[(n)] }; \
|
||||
cc_ws_t ws = &ws##_ctx; \
|
||||
if (NULL == ws->start) \
|
||||
return CCERR_MEMORY_ALLOC_FAIL; \
|
||||
cc_alloc_debug(ws->start, __FILE__, __LINE__, __func__);
|
||||
|
||||
// Free and reset pointers to avoid future references
|
||||
#define CC_FREE_WORKSPACE(ws) \
|
||||
cc_free_debug(ws->start); \
|
||||
cc_try_abort_if(ws->start > ws->end, "free ws"); \
|
||||
cc_free(ws->start, (size_t)(ws->end - ws->start) * sizeof(ws->start[0])); \
|
||||
ws->start = NULL; \
|
||||
ws->end = NULL;
|
||||
|
||||
#else // !CC_USE_HEAP_FOR_WORKSPACE
|
||||
|
||||
// Declare a variable in stack and use its address
|
||||
// Could use alloca, but alloca is neither portable nor secure.
|
||||
#define CC_DECL_WORKSPACE_OR_FAIL CC_DECL_WORKSPACE_STACK
|
||||
|
||||
// Reset pointers to avoid future reference
|
||||
#define CC_FREE_WORKSPACE CC_FREE_WORKSPACE_STACK
|
||||
|
||||
#endif // !CC_USE_HEAP_FOR_WORKSPACE
|
||||
|
||||
// =============================================================================
|
||||
// Common
|
||||
// =============================================================================
|
||||
|
||||
#define CC_CLEAR_AND_FREE_WORKSPACE(ws) \
|
||||
cc_try_abort_if(ws->start > ws->end, "clear ws"); \
|
||||
ccn_clear((cc_size)(ws->end - ws->start), ws->start); \
|
||||
CC_FREE_WORKSPACE(ws);
|
||||
|
||||
// To allocate an array of n cc_units in the WS
|
||||
#define CC_DECL_BP_WS(ws, bp) cc_unit *bp = ws->start;
|
||||
#define CC_FREE_BP_WS(ws, bp) ws->start = bp;
|
||||
#define CC_ALLOC_WS(ws, n) \
|
||||
ws->start; \
|
||||
ws->start += n; \
|
||||
cc_try_abort_if(ws->start > ws->end, "alloc ws");
|
||||
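/*
 * Usage sketch (editor's illustration, not part of corecrypto; assumes the
 * cc_ws/cc_ws_t types from cc.h and ccn_clear from ccn.h): a routine that
 * needs n temporary cc_units declares a workspace, carves scratch space out
 * of it with CC_ALLOC_WS, and clears and frees it before returning.
 */
static int ws_example(cc_size n)
{
    CC_DECL_WORKSPACE_OR_FAIL(ws, n);
    CC_DECL_BP_WS(ws, bp);
    cc_unit *tmp = CC_ALLOC_WS(ws, n); /* n units of scratch */
    ccn_clear(n, tmp);                 /* ... real work would go here ... */
    CC_FREE_BP_WS(ws, bp);
    CC_CLEAR_AND_FREE_WORKSPACE(ws);
    return CCERR_OK;
}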
|
||||
#if CC_KERNEL
|
||||
#include <libkern/section_keywords.h>
|
||||
#define CC_READ_ONLY_LATE(_t) SECURITY_READ_ONLY_LATE(_t)
|
||||
#else
|
||||
#define CC_READ_ONLY_LATE(_t) _t
|
||||
#endif
|
||||
|
||||
#endif // corecrypto_cc_memory_h
|
||||
|
|
@ -0,0 +1,818 @@
|
|||
/* Copyright (c) (2010,2011,2012,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#ifndef _CORECRYPTO_CC_PRIV_H_
|
||||
#define _CORECRYPTO_CC_PRIV_H_
|
||||
|
||||
#include <corecrypto/cc.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// Fork handlers for the stateful components of corecrypto.
|
||||
void cc_atfork_prepare(void);
|
||||
void cc_atfork_parent(void);
|
||||
void cc_atfork_child(void);
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
#ifndef __DECONST
|
||||
#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
|
||||
#endif
|
||||
|
||||
/* Defines the following macros:

 CC_ARRAY_LEN : returns the number of elements in an array.

 CC_STORE32_BE : store a 32-bit value in big endian into an unaligned buffer.
 CC_STORE32_LE : store a 32-bit value in little endian into an unaligned buffer.
 CC_STORE64_BE : store a 64-bit value in big endian into an unaligned buffer.
 CC_STORE64_LE : store a 64-bit value in little endian into an unaligned buffer.

 CC_LOAD32_BE : load a 32-bit value in big endian from an unaligned buffer.
 CC_LOAD32_LE : load a 32-bit value in little endian from an unaligned buffer.
 CC_LOAD64_BE : load a 64-bit value in big endian from an unaligned buffer.
 CC_LOAD64_LE : load a 64-bit value in little endian from an unaligned buffer.

 CC_ROR : rotate right 32 bits. The rotate count can be a variable.
 CC_ROL : rotate left 32 bits. The rotate count can be a variable.
 CC_RORc : rotate right 32 bits. The rotate count must be a constant.
 CC_ROLc : rotate left 32 bits. The rotate count must be a constant.

 CC_ROR64 : rotate right 64 bits. The rotate count can be a variable.
 CC_ROL64 : rotate left 64 bits. The rotate count can be a variable.
 CC_ROR64c : rotate right 64 bits. The rotate count must be a constant.
 CC_ROL64c : rotate left 64 bits. The rotate count must be a constant.

 CC_BSWAP : byte-swap a 32-bit variable.

 CC_H2BE32 : convert a 32-bit value between host and big-endian order.
 CC_H2LE32 : convert a 32-bit value between host and little-endian order.

 CC_BSWAP64 : byte-swap a 64-bit variable.

 CC_READ_LE32 : read a 32-bit little-endian value.

 CC_WRITE_LE32 : write a 32-bit little-endian value.
 CC_WRITE_LE64 : write a 64-bit little-endian value.

 CC_H2BE64 : convert a 64-bit value between host and big-endian order.
 CC_H2LE64 : convert a 64-bit value between host and little-endian order.

*/
|
||||
|
||||
// RTKitOSPlatform should replace CC_MEMCPY with memcpy
|
||||
#define CC_MEMCPY(D,S,L) cc_memcpy((D),(S),(L))
|
||||
#define CC_MEMMOVE(D,S,L) cc_memmove((D),(S),(L))
|
||||
#define CC_MEMSET(D,V,L) cc_memset((D),(V),(L))
|
||||
|
||||
#if __has_builtin(__builtin___memcpy_chk) && !defined(_MSC_VER)
|
||||
#define cc_memcpy(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 1))
|
||||
#define cc_memcpy_nochk(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 0))
|
||||
#else
|
||||
#define cc_memcpy(dst, src, len) memcpy((dst), (src), (len))
|
||||
#define cc_memcpy_nochk(dst, src, len) memcpy((dst), (src), (len))
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin___memmove_chk) && !defined(_MSC_VER)
|
||||
#define cc_memmove(dst, src, len) __builtin___memmove_chk((dst), (src), (len), __builtin_object_size((dst), 1))
|
||||
#else
|
||||
#define cc_memmove(dst, src, len) memmove((dst), (src), (len))
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin___memset_chk) && !defined(_MSC_VER)
|
||||
#define cc_memset(dst, val, len) __builtin___memset_chk((dst), (val), (len), __builtin_object_size((dst), 1))
|
||||
#else
|
||||
#define cc_memset(dst, val, len) memset((dst), (val), (len))
|
||||
#endif
|
||||
|
||||
#define CC_ARRAY_LEN(x) (sizeof((x))/sizeof((x)[0]))
|
||||
|
||||
// MARK: - Loads and Stores
|
||||
|
||||
// MARK: -- 32 bits - little endian
|
||||
|
||||
// MARK: --- Default version
|
||||
|
||||
#define CC_STORE32_LE(x, y) do { \
|
||||
((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \
|
||||
((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \
|
||||
((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \
|
||||
((unsigned char *)(y))[0] = (unsigned char)((x)&255); \
|
||||
} while(0)
|
||||
|
||||
#define CC_LOAD32_LE(x, y) do { \
|
||||
x = ((uint32_t)(((const unsigned char *)(y))[3] & 255)<<24) | \
|
||||
((uint32_t)(((const unsigned char *)(y))[2] & 255)<<16) | \
|
||||
((uint32_t)(((const unsigned char *)(y))[1] & 255)<<8) | \
|
||||
((uint32_t)(((const unsigned char *)(y))[0] & 255)); \
|
||||
} while(0)
|
||||
|
||||
// MARK: -- 64 bits - little endian
|
||||
|
||||
#define CC_STORE64_LE(x, y) do { \
|
||||
((unsigned char *)(y))[7] = (unsigned char)(((x)>>56)&255); \
|
||||
((unsigned char *)(y))[6] = (unsigned char)(((x)>>48)&255); \
|
||||
((unsigned char *)(y))[5] = (unsigned char)(((x)>>40)&255); \
|
||||
((unsigned char *)(y))[4] = (unsigned char)(((x)>>32)&255); \
|
||||
((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \
|
||||
((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \
|
||||
((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \
|
||||
((unsigned char *)(y))[0] = (unsigned char)((x)&255); \
|
||||
} while(0)
|
||||
|
||||
#define CC_LOAD64_LE(x, y) do { \
|
||||
x = (((uint64_t)(((const unsigned char *)(y))[7] & 255))<<56) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[6] & 255))<<48) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[5] & 255))<<40) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[4] & 255))<<32) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[3] & 255))<<24) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[2] & 255))<<16) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[1] & 255))<<8) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[0] & 255))); \
|
||||
} while(0)
|
||||
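/*
 * Usage sketch (editor's illustration, not part of corecrypto): the
 * load/store macros operate on unaligned byte buffers, so a store followed
 * by a load round-trips regardless of host endianness.
 */
static uint64_t le64_roundtrip_example(uint64_t x)
{
    unsigned char buf[8];
    uint64_t y;
    CC_STORE64_LE(x, buf);
    CC_LOAD64_LE(y, buf);
    return y; /* y == x */
}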
|
||||
// MARK: -- 32 bits - big endian
|
||||
// MARK: --- intel version
|
||||
|
||||
#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
|
||||
|
||||
#define CC_STORE32_BE(x, y) \
|
||||
__asm__ __volatile__ ( \
|
||||
"bswapl %0 \n\t" \
|
||||
"movl %0,(%1)\n\t" \
|
||||
"bswapl %0 \n\t" \
|
||||
::"r"(x), "r"(y))
|
||||
|
||||
#define CC_LOAD32_BE(x, y) \
|
||||
__asm__ __volatile__ ( \
|
||||
"movl (%1),%0\n\t" \
|
||||
"bswapl %0\n\t" \
|
||||
:"=r"(x): "r"(y))
|
||||
|
||||
#else
|
||||
// MARK: --- default version
|
||||
#define CC_STORE32_BE(x, y) do { \
|
||||
((unsigned char *)(y))[0] = (unsigned char)(((x)>>24)&255); \
|
||||
((unsigned char *)(y))[1] = (unsigned char)(((x)>>16)&255); \
|
||||
((unsigned char *)(y))[2] = (unsigned char)(((x)>>8)&255); \
|
||||
((unsigned char *)(y))[3] = (unsigned char)((x)&255); \
|
||||
} while(0)
|
||||
|
||||
#define CC_LOAD32_BE(x, y) do { \
|
||||
x = ((uint32_t)(((const unsigned char *)(y))[0] & 255)<<24) | \
|
||||
((uint32_t)(((const unsigned char *)(y))[1] & 255)<<16) | \
|
||||
((uint32_t)(((const unsigned char *)(y))[2] & 255)<<8) | \
|
||||
((uint32_t)(((const unsigned char *)(y))[3] & 255)); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
// MARK: -- 64 bits - big endian
|
||||
|
||||
// MARK: --- intel 64 bits version
|
||||
|
||||
#if defined(__x86_64__) && !defined (_MSC_VER)
|
||||
|
||||
#define CC_STORE64_BE(x, y) \
|
||||
__asm__ __volatile__ ( \
|
||||
"bswapq %0 \n\t" \
|
||||
"movq %0,(%1)\n\t" \
|
||||
"bswapq %0 \n\t" \
|
||||
::"r"(x), "r"(y))
|
||||
|
||||
#define CC_LOAD64_BE(x, y) \
|
||||
__asm__ __volatile__ ( \
|
||||
"movq (%1),%0\n\t" \
|
||||
"bswapq %0\n\t" \
|
||||
:"=r"(x): "r"(y))
|
||||
|
||||
#else
|
||||
|
||||
// MARK: --- default version
|
||||
|
||||
#define CC_STORE64_BE(x, y) do { \
|
||||
((unsigned char *)(y))[0] = (unsigned char)(((x)>>56)&255); \
|
||||
((unsigned char *)(y))[1] = (unsigned char)(((x)>>48)&255); \
|
||||
((unsigned char *)(y))[2] = (unsigned char)(((x)>>40)&255); \
|
||||
((unsigned char *)(y))[3] = (unsigned char)(((x)>>32)&255); \
|
||||
((unsigned char *)(y))[4] = (unsigned char)(((x)>>24)&255); \
|
||||
((unsigned char *)(y))[5] = (unsigned char)(((x)>>16)&255); \
|
||||
((unsigned char *)(y))[6] = (unsigned char)(((x)>>8)&255); \
|
||||
((unsigned char *)(y))[7] = (unsigned char)((x)&255); \
|
||||
} while(0)
|
||||
|
||||
#define CC_LOAD64_BE(x, y) do { \
|
||||
x = (((uint64_t)(((const unsigned char *)(y))[0] & 255))<<56) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[1] & 255))<<48) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[2] & 255))<<40) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[3] & 255))<<32) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[4] & 255))<<24) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[5] & 255))<<16) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[6] & 255))<<8) | \
|
||||
(((uint64_t)(((const unsigned char *)(y))[7] & 255))); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
// MARK: - 32-bit Rotates
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// MARK: -- MSVC version
|
||||
|
||||
#include <stdlib.h>
|
||||
#if !defined(__clang__)
|
||||
#pragma intrinsic(_lrotr,_lrotl)
|
||||
#endif
|
||||
#define CC_ROR(x,n) _lrotr(x,n)
|
||||
#define CC_ROL(x,n) _lrotl(x,n)
|
||||
#define CC_RORc(x,n) _lrotr(x,n)
|
||||
#define CC_ROLc(x,n) _lrotl(x,n)
|
||||
|
||||
#elif (defined(__i386__) || defined(__x86_64__))
|
||||
// MARK: -- intel asm version
|
||||
|
||||
CC_INLINE uint32_t CC_ROL(uint32_t word, int i)
|
||||
{
|
||||
__asm__ ("roll %%cl,%0"
|
||||
:"=r" (word)
|
||||
:"0" (word),"c" (i));
|
||||
return word;
|
||||
}
|
||||
|
||||
CC_INLINE uint32_t CC_ROR(uint32_t word, int i)
|
||||
{
|
||||
__asm__ ("rorl %%cl,%0"
|
||||
:"=r" (word)
|
||||
:"0" (word),"c" (i));
|
||||
return word;
|
||||
}
|
||||
|
||||
/* Needs to be a macro here, because 'i' is an immediate (constant) */
|
||||
#define CC_ROLc(word, i) \
|
||||
({ uint32_t _word=(word); \
|
||||
__asm__ __volatile__ ("roll %2,%0" \
|
||||
:"=r" (_word) \
|
||||
:"0" (_word),"I" (i)); \
|
||||
_word; \
|
||||
})
|
||||
|
||||
|
||||
#define CC_RORc(word, i) \
|
||||
({ uint32_t _word=(word); \
|
||||
__asm__ __volatile__ ("rorl %2,%0" \
|
||||
:"=r" (_word) \
|
||||
:"0" (_word),"I" (i)); \
|
||||
_word; \
|
||||
})
|
||||
|
||||
#else
|
||||
|
||||
// MARK: -- default version
|
||||
|
||||
CC_INLINE uint32_t CC_ROL(uint32_t word, int i)
|
||||
{
|
||||
return ( (word<<(i&31)) | (word>>(32-(i&31))) );
|
||||
}
|
||||
|
||||
CC_INLINE uint32_t CC_ROR(uint32_t word, int i)
|
||||
{
|
||||
return ( (word>>(i&31)) | (word<<(32-(i&31))) );
|
||||
}
|
||||
|
||||
#define CC_ROLc(x, y) CC_ROL(x, y)
|
||||
#define CC_RORc(x, y) CC_ROR(x, y)
|
||||
|
||||
#endif
|
||||
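/*
 * Usage sketch (editor's illustration, not part of corecrypto): CC_ROL and
 * CC_ROR take a variable count, while the *c variants require a
 * compile-time constant (they map to immediate-form instructions on x86).
 */
static uint32_t rot_example(uint32_t w, int n)
{
    uint32_t a = CC_ROL(w, n);  /* variable count */
    uint32_t b = CC_RORc(a, 7); /* constant count */
    return b;
}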
|
||||
// MARK: - 64 bits rotates
|
||||
|
||||
#if defined(__x86_64__) && !defined(_MSC_VER) // MSVC doesn't support GNU-style inline assembly
|
||||
// MARK: -- intel 64 asm version
|
||||
|
||||
CC_INLINE uint64_t CC_ROL64(uint64_t word, int i)
|
||||
{
|
||||
__asm__("rolq %%cl,%0"
|
||||
:"=r" (word)
|
||||
:"0" (word),"c" (i));
|
||||
return word;
|
||||
}
|
||||
|
||||
CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
|
||||
{
|
||||
__asm__("rorq %%cl,%0"
|
||||
:"=r" (word)
|
||||
:"0" (word),"c" (i));
|
||||
return word;
|
||||
}
|
||||
|
||||
/* Needs to be a macro here, because 'i' is an immediate (constant) */
|
||||
#define CC_ROL64c(word, i) \
|
||||
({ \
|
||||
uint64_t _word=(word); \
|
||||
__asm__("rolq %2,%0" \
|
||||
:"=r" (_word) \
|
||||
:"0" (_word),"J" (i)); \
|
||||
_word; \
|
||||
})
|
||||
|
||||
#define CC_ROR64c(word, i) \
|
||||
({ \
|
||||
uint64_t _word=(word); \
|
||||
__asm__("rorq %2,%0" \
|
||||
:"=r" (_word) \
|
||||
:"0" (_word),"J" (i)); \
|
||||
_word; \
|
||||
})
|
||||
|
||||
|
||||
#else /* Not x86_64 */
|
||||
|
||||
// MARK: -- default C version
|
||||
|
||||
CC_INLINE uint64_t CC_ROL64(uint64_t word, int i)
|
||||
{
|
||||
return ( (word<<(i&63)) | (word>>(64-(i&63))) );
|
||||
}
|
||||
|
||||
CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
|
||||
{
|
||||
return ( (word>>(i&63)) | (word<<(64-(i&63))) );
|
||||
}
|
||||
|
||||
#define CC_ROL64c(x, y) CC_ROL64(x, y)
|
||||
#define CC_ROR64c(x, y) CC_ROR64(x, y)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// MARK: - Byte Swaps
|
||||
|
||||
#if __has_builtin(__builtin_bswap32)
|
||||
#define CC_BSWAP32(x) __builtin_bswap32(x)
|
||||
#else
|
||||
CC_INLINE uint32_t CC_BSWAP32(uint32_t x)
|
||||
{
|
||||
return
|
||||
((x & 0xff000000) >> 24) |
|
||||
((x & 0x00ff0000) >> 8) |
|
||||
((x & 0x0000ff00) << 8) |
|
||||
((x & 0x000000ff) << 24);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin_bswap64)
|
||||
#define CC_BSWAP64(x) __builtin_bswap64(x)
|
||||
#else
|
||||
CC_INLINE uint64_t CC_BSWAP64(uint64_t x)
|
||||
{
|
||||
return
|
||||
((x & 0xff00000000000000ULL) >> 56) |
|
||||
((x & 0x00ff000000000000ULL) >> 40) |
|
||||
((x & 0x0000ff0000000000ULL) >> 24) |
|
||||
((x & 0x000000ff00000000ULL) >> 8) |
|
||||
((x & 0x00000000ff000000ULL) << 8) |
|
||||
((x & 0x0000000000ff0000ULL) << 24) |
|
||||
((x & 0x000000000000ff00ULL) << 40) |
|
||||
((x & 0x00000000000000ffULL) << 56);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
#define CC_H2BE32(x) CC_BSWAP32(x)
|
||||
#define CC_H2LE32(x) (x)
|
||||
#define CC_H2BE64(x) CC_BSWAP64(x)
|
||||
#define CC_H2LE64(x) (x)
|
||||
#else
|
||||
#define CC_H2BE32(x) (x)
|
||||
#define CC_H2LE32(x) CC_BSWAP32(x)
|
||||
#define CC_H2BE64(x) (x)
|
||||
#define CC_H2LE64(x) CC_BSWAP64(x)
|
||||
#endif
|
||||
|
||||
#define CC_READ_LE32(ptr) \
|
||||
( (uint32_t)( \
|
||||
((uint32_t)((const uint8_t *)(ptr))[0]) | \
|
||||
(((uint32_t)((const uint8_t *)(ptr))[1]) << 8) | \
|
||||
(((uint32_t)((const uint8_t *)(ptr))[2]) << 16) | \
|
||||
(((uint32_t)((const uint8_t *)(ptr))[3]) << 24)))
|
||||
|
||||
#define CC_WRITE_LE32(ptr, x) \
|
||||
do { \
|
||||
((uint8_t *)(ptr))[0] = (uint8_t)( (x) & 0xFF); \
|
||||
((uint8_t *)(ptr))[1] = (uint8_t)(((x) >> 8) & 0xFF); \
|
||||
((uint8_t *)(ptr))[2] = (uint8_t)(((x) >> 16) & 0xFF); \
|
||||
((uint8_t *)(ptr))[3] = (uint8_t)(((x) >> 24) & 0xFF); \
|
||||
} while(0)
|
||||
|
||||
#define CC_WRITE_LE64(ptr, x) \
|
||||
do { \
|
||||
((uint8_t *)(ptr))[0] = (uint8_t)( (x) & 0xFF); \
|
||||
((uint8_t *)(ptr))[1] = (uint8_t)(((x) >> 8) & 0xFF); \
|
||||
((uint8_t *)(ptr))[2] = (uint8_t)(((x) >> 16) & 0xFF); \
|
||||
((uint8_t *)(ptr))[3] = (uint8_t)(((x) >> 24) & 0xFF); \
|
||||
((uint8_t *)(ptr))[4] = (uint8_t)(((x) >> 32) & 0xFF); \
|
||||
((uint8_t *)(ptr))[5] = (uint8_t)(((x) >> 40) & 0xFF); \
|
||||
((uint8_t *)(ptr))[6] = (uint8_t)(((x) >> 48) & 0xFF); \
|
||||
((uint8_t *)(ptr))[7] = (uint8_t)(((x) >> 56) & 0xFF); \
|
||||
} while(0)
|
||||
|
||||
/* extract a byte portably */
|
||||
#ifdef _MSC_VER
|
||||
#define cc_byte(x, n) ((unsigned char)((x) >> (8 * (n))))
|
||||
#else
|
||||
#define cc_byte(x, n) (((x) >> (8 * (n))) & 255)
|
||||
#endif
|
||||
|
||||
/* Count leading zeros (for nonzero inputs) */
|
||||
|
||||
/*
|
||||
* On i386 and x86_64, we know clang and GCC will generate BSR for
|
||||
* __builtin_clzl. This instruction IS NOT constant time on all micro-
|
||||
* architectures, but it *is* constant time on all micro-architectures that
|
||||
* have been used by Apple, and we expect that to continue to be the case.
|
||||
*
|
||||
* When building for x86_64h with clang, this produces LZCNT, which is exactly
|
||||
* what we want.
|
||||
*
|
||||
* On arm and arm64, we know that clang and GCC generate the constant-time CLZ
|
||||
* instruction from __builtin_clzl( ).
|
||||
*/
|
||||
|
||||
#if defined(_WIN32)
|
||||
/* We use the Windows implementations below. */
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__)
|
||||
/* We use __builtin_clz, which is thought to be constant-time on these architectures. */
|
||||
#elif defined __GNUC__
|
||||
#warning Using __builtin_clz() on an unknown architecture; it may not be constant-time.
|
||||
/* If you find yourself seeing this warning, file a radar for someone to
|
||||
* check whether or not __builtin_clz() generates a constant-time
|
||||
* implementation on the architecture you are targeting. If it does, append
|
||||
* the name of that architecture to the list of "safe" architectures above. */
|
||||
#endif
|
||||
|
||||
CC_INLINE CC_CONST unsigned cc_clz32_fallback(uint32_t data)
|
||||
{
|
||||
unsigned int b = 0;
|
||||
unsigned int bit = 0;
|
||||
// Work from LSB to MSB
|
||||
for (int i = 0; i < 32; i++) {
|
||||
bit = (data >> i) & 1;
|
||||
// If the bit is 0, update the "leading bits are zero" counter "b".
|
||||
b += (1 - bit);
|
||||
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
|
||||
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
|
||||
*/
|
||||
b &= (bit - 1);
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
CC_INLINE CC_CONST unsigned cc_clz64_fallback(uint64_t data)
|
||||
{
|
||||
unsigned int b = 0;
|
||||
unsigned int bit = 0;
|
||||
// Work from LSB to MSB
|
||||
for (int i = 0; i < 64; i++) {
|
||||
bit = (data >> i) & 1;
|
||||
// If the bit is 0, update the "leading bits are zero" counter.
|
||||
b += (1 - bit);
|
||||
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
|
||||
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
|
||||
*/
|
||||
b &= (bit - 1);
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
CC_INLINE CC_CONST unsigned cc_ctz32_fallback(uint32_t data)
|
||||
{
|
||||
unsigned int b = 0;
|
||||
unsigned int bit = 0;
|
||||
// Work from MSB to LSB
|
||||
for (int i = 31; i >= 0; i--) {
|
||||
bit = (data >> i) & 1;
|
||||
// If the bit is 0, update the "trailing zero bits" counter.
|
||||
b += (1 - bit);
|
||||
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
|
||||
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
|
||||
*/
|
||||
b &= (bit - 1);
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
CC_INLINE CC_CONST unsigned cc_ctz64_fallback(uint64_t data)
|
||||
{
|
||||
unsigned int b = 0;
|
||||
unsigned int bit = 0;
|
||||
// Work from MSB to LSB
|
||||
for (int i = 63; i >= 0; i--) {
|
||||
bit = (data >> i) & 1;
|
||||
// If the bit is 0, update the "trailing zero bits" counter.
|
||||
b += (1 - bit);
|
||||
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
|
||||
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
|
||||
*/
|
||||
b &= (bit - 1);
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_clz32
|
||||
@abstract Count leading zeros of a nonzero 32-bit value
|
||||
|
||||
@param data A nonzero 32-bit value
|
||||
|
||||
@result Count of leading zeros of @p data
|
||||
|
||||
@discussion @p data is assumed to be nonzero.
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_clz32(uint32_t data) {
|
||||
cc_assert(data != 0);
|
||||
#if defined(_WIN32)
|
||||
return cc_clz32_fallback(data);
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
|
||||
cc_static_assert(sizeof(unsigned) == 4, "clz relies on an unsigned int being 4 bytes");
|
||||
return (unsigned)__builtin_clz(data);
|
||||
#else
|
||||
return cc_clz32_fallback(data);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_clz64
|
||||
@abstract Count leading zeros of a nonzero 64-bit value
|
||||
|
||||
@param data A nonzero 64-bit value
|
||||
|
||||
@result Count of leading zeros of @p data
|
||||
|
||||
@discussion @p data is assumed to be nonzero.
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_clz64(uint64_t data) {
|
||||
cc_assert(data != 0);
|
||||
#if defined(_WIN32)
|
||||
return cc_clz64_fallback(data);
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
|
||||
return (unsigned)__builtin_clzll(data);
|
||||
#else
|
||||
return cc_clz64_fallback(data);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_ctz32
|
||||
@abstract Count trailing zeros of a nonzero 32-bit value
|
||||
|
||||
@param data A nonzero 32-bit value
|
||||
|
||||
@result Count of trailing zeros of @p data
|
||||
|
||||
@discussion @p data is assumed to be nonzero.
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_ctz32(uint32_t data) {
|
||||
cc_assert(data != 0);
|
||||
#if defined(_WIN32)
|
||||
return cc_ctz32_fallback(data);
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
|
||||
cc_static_assert(sizeof(unsigned) == 4, "ctz relies on an unsigned int being 4 bytes");
|
||||
return (unsigned)__builtin_ctz(data);
|
||||
#else
|
||||
return cc_ctz32_fallback(data);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_ctz64
|
||||
@abstract Count trailing zeros of a nonzero 64-bit value
|
||||
|
||||
@param data A nonzero 64-bit value
|
||||
|
||||
@result Count of trailing zeros of @p data
|
||||
|
||||
@discussion @p data is assumed to be nonzero.
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_ctz64(uint64_t data) {
|
||||
cc_assert(data != 0);
|
||||
#if defined(_WIN32)
|
||||
return cc_ctz64_fallback(data);
|
||||
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
|
||||
return (unsigned)__builtin_ctzll(data);
|
||||
#else
|
||||
return cc_ctz64_fallback(data);
|
||||
#endif
|
||||
}
|
||||
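/*
 * Worked examples (editor's illustration, not part of corecrypto): for
 * 0x00010000 the only set bit is bit 16, so 15 bits above it and 16 bits
 * below it are zero.
 */
static void clz_ctz_example(void)
{
    cc_assert(cc_clz32(0x00010000) == 15);
    cc_assert(cc_ctz32(0x00010000) == 16);
    cc_assert(cc_clz64(1) == 63);
}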
|
||||
/*!
|
||||
@function cc_ffs32_fallback
|
||||
@abstract Find first bit set in a 32-bit value
|
||||
|
||||
@param data A 32-bit value
|
||||
|
||||
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_ffs32_fallback(int32_t data)
|
||||
{
|
||||
unsigned b = 0;
|
||||
unsigned bit = 0;
|
||||
unsigned seen = 0;
|
||||
|
||||
// Work from LSB to MSB
|
||||
for (int i = 0; i < 32; i++) {
|
||||
bit = ((uint32_t)data >> i) & 1;
|
||||
|
||||
// Track whether we've seen a 1 bit.
|
||||
seen |= bit;
|
||||
|
||||
// If the bit is 0 and we haven't seen a 1 yet, increment b.
|
||||
b += (1 - bit) & (seen - 1);
|
||||
}
|
||||
|
||||
// If we saw a 1, return b + 1, else 0.
|
||||
return (~(seen - 1)) & (b + 1);
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_ffs64_fallback
|
||||
@abstract Find first bit set in a 64-bit value
|
||||
|
||||
@param data A 64-bit value
|
||||
|
||||
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_ffs64_fallback(int64_t data)
|
||||
{
|
||||
unsigned b = 0;
|
||||
unsigned bit = 0;
|
||||
unsigned seen = 0;
|
||||
|
||||
// Work from LSB to MSB
|
||||
for (int i = 0; i < 64; i++) {
|
||||
bit = ((uint64_t)data >> i) & 1;
|
||||
|
||||
// Track whether we've seen a 1 bit.
|
||||
seen |= bit;
|
||||
|
||||
// If the bit is 0 and we haven't seen a 1 yet, increment b.
|
||||
b += (1 - bit) & (seen - 1);
|
||||
}
|
||||
|
||||
// If we saw a 1, return b + 1, else 0.
|
||||
return (~(seen - 1)) & (b + 1);
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_ffs32
|
||||
@abstract Find first bit set in a 32-bit value
|
||||
|
||||
@param data A 32-bit value
|
||||
|
||||
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_ffs32(int32_t data)
|
||||
{
|
||||
cc_static_assert(sizeof(int) == 4, "ffs relies on an int being 4 bytes");
|
||||
#ifdef _WIN32
|
||||
return cc_ffs32_fallback(data);
|
||||
#else
|
||||
return (unsigned)__builtin_ffs(data);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*!
|
||||
@function cc_ffs64
|
||||
@abstract Find first bit set in a 64-bit value
|
||||
|
||||
@param data A 64-bit value
|
||||
|
||||
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
|
||||
*/
|
||||
CC_INLINE CC_CONST unsigned cc_ffs64(int64_t data)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return cc_ffs64_fallback(data);
|
||||
#else
|
||||
return (unsigned)__builtin_ffsll(data);
|
||||
#endif
|
||||
}
|
||||
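/*
 * Worked examples (editor's illustration, not part of corecrypto): ffs
 * returns one plus the index of the lowest set bit, and 0 for a zero input.
 */
static void ffs_example(void)
{
    cc_assert(cc_ffs32(0) == 0);
    cc_assert(cc_ffs32(8) == 4);  /* lowest set bit is bit 3 */
    cc_assert(cc_ffs64(-1) == 1); /* bit 0 is set */
}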
|
||||
#define cc_add_overflow __builtin_add_overflow
|
||||
#define cc_mul_overflow __builtin_mul_overflow
|
||||
|
||||
/* HEAVISIDE_STEP (shifted by one)
   f(x) = 0 when x == 0
   f(x) = 1 when x > 0
   Can also be seen as a bitwise operation:
   f(x): x -> y, where y[0] = OR of all bits x[i], and y[i] = 0 for all i > 0.
   Runs in constant time (log2(<bitsize of x>)).
   Useful for constant-time checks. */
|
||||
#define CC_HEAVISIDE_STEP(r, s) { \
|
||||
const uint64_t _s = (uint64_t)s; \
|
||||
const uint64_t _t = (_s & 0xffffffff) | (_s >> 32); \
|
||||
r = (__typeof__(r))((0xffffffff + _t) >> 32); \
|
||||
}
|
||||
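/*
 * Usage sketch (editor's illustration, not part of corecrypto): a
 * constant-time "is nonzero" flag.
 */
static uint8_t nonzero_flag_example(uint64_t x)
{
    uint8_t r;
    CC_HEAVISIDE_STEP(r, x); /* r = 1 if x != 0, else 0 */
    return r;
}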
|
||||
/* Return 1 if x mod 4 is 1, 2, or 3; 0 otherwise */
|
||||
#define CC_CARRY_2BITS(x) (((x>>1) | x) & 0x1)
|
||||
#define CC_CARRY_3BITS(x) (((x>>2) | (x>>1) | x) & 0x1)
|
||||
|
||||
#define cc_ceiling(a,b) (((a)+((b)-1))/(b))
|
||||
#define CC_BITLEN_TO_BYTELEN(x) cc_ceiling((x), 8)
|
||||
|
||||
/*!
|
||||
@brief cc_muxp(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time
|
||||
@param a input pointer
|
||||
@param b input pointer
|
||||
@param s The selection parameter; must be 0 or 1. If s is 1, a is returned; if s is 0, b is returned. Otherwise, the output is undefined.
|
||||
@return a if s is 1, b if s is 0
|
||||
*/
|
||||
void *cc_muxp(int s, const void *a, const void *b);
|
||||
|
||||
/*!
|
||||
@brief CC_MUXU(r, s, a, b) is equivalent to r = s ? a : b, but executes in constant time
|
||||
@param a Input a
|
||||
@param b Input b
|
||||
@param s Selection parameter s. Must be 0 or 1.
|
||||
@param r Output, set to a if s=1, or b if s=0.
|
||||
*/
|
||||
#define CC_MUXU(r, s, a, b) \
|
||||
{ \
|
||||
__typeof__(r) _cond = (__typeof__(r))((s)-1); \
|
||||
r = (~_cond & (a)) | (_cond & (b)); \
|
||||
}
|
||||
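/*
 * Usage sketch (editor's illustration, not part of corecrypto): select
 * between two words without a data-dependent branch. s must be 0 or 1.
 */
static uint32_t mux_example(uint32_t s, uint32_t a, uint32_t b)
{
    uint32_t r;
    CC_MUXU(r, s, a, b); /* r = s ? a : b, in constant time */
    return r;
}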
|
||||
#define CC_PROVIDES_ABORT (!(CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKITROM))
|
||||
|
||||
/*!
|
||||
@function cc_abort
|
||||
@abstract Abort execution unconditionally
|
||||
*/
|
||||
CC_NORETURN
|
||||
void cc_abort(const char *msg);
|
||||
|
||||
/*!
|
||||
@function cc_try_abort
|
||||
@abstract Abort execution iff the platform provides a function like @p abort() or @p panic()
|
||||
|
||||
@discussion If the platform does not provide a means to abort execution, this function does nothing; therefore, callers should return an error code after calling this function.
|
||||
*/
|
||||
#if CC_PROVIDES_ABORT
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wmissing-noreturn"
|
||||
|
||||
CC_INLINE
|
||||
void cc_try_abort(const char *msg)
|
||||
{
|
||||
cc_abort(msg);
|
||||
}
|
||||
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
#else
|
||||
|
||||
CC_INLINE
|
||||
void cc_try_abort(CC_UNUSED const char *msg)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if __has_builtin(__builtin_expect)
|
||||
#define CC_UNLIKELY(cond) __builtin_expect(cond, 0)
|
||||
#else
|
||||
#define CC_UNLIKELY(cond) cond
|
||||
#endif
|
||||
|
||||
CC_INLINE
|
||||
void cc_try_abort_if(bool condition, const char *msg)
|
||||
{
|
||||
if (CC_UNLIKELY(condition)) {
|
||||
cc_try_abort(msg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Unfortunately, since we export this symbol, this declaration needs
|
||||
to be in a public header to satisfy TAPI.
|
||||
|
||||
See fipspost_trace_priv.h for more details.
|
||||
*/
|
||||
extern const void *fipspost_trace_vtable;
|
||||
|
||||
#endif /* _CORECRYPTO_CC_PRIV_H_ */
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
/* Copyright (c) (2012,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
#ifndef CORECRYPTO_CC_RUNTIME_CONFIG_H_
|
||||
#define CORECRYPTO_CC_RUNTIME_CONFIG_H_
|
||||
|
||||
#include <corecrypto/cc_config.h>
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#if CC_KERNEL
|
||||
#include <i386/cpuid.h>
|
||||
#define CC_HAS_RDRAND() ((cpuid_features() & CPUID_FEATURE_RDRAND) != 0)
|
||||
#define CC_HAS_AESNI() ((cpuid_features() & CPUID_FEATURE_AES) != 0)
|
||||
#define CC_HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
|
||||
#define CC_HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
|
||||
#define CC_HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
|
||||
#define CC_HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
|
||||
#define CC_HAS_BMI2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_BMI2) != 0)
|
||||
#define CC_HAS_ADX() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_ADX) != 0)
|
||||
|
||||
#elif CC_DARWIN && CC_INTERNAL_SDK
|
||||
#include <System/i386/cpu_capabilities.h>
|
||||
#define CC_HAS_RDRAND() (_get_cpu_capabilities() & kHasRDRAND)
|
||||
#define CC_HAS_AESNI() (_get_cpu_capabilities() & kHasAES)
|
||||
#define CC_HAS_SupplementalSSE3() (_get_cpu_capabilities() & kHasSupplementalSSE3)
|
||||
#define CC_HAS_AVX1() (_get_cpu_capabilities() & kHasAVX1_0)
|
||||
#define CC_HAS_AVX2() (_get_cpu_capabilities() & kHasAVX2_0)
|
||||
#define CC_HAS_AVX512_AND_IN_KERNEL() 0
|
||||
#define CC_HAS_BMI2() (_get_cpu_capabilities() & kHasBMI2)
|
||||
#define CC_HAS_ADX() (_get_cpu_capabilities() & kHasADX)
|
||||
|
||||
#else
|
||||
#define CC_HAS_AESNI() __builtin_cpu_supports("aes")
|
||||
#define CC_HAS_SupplementalSSE3() __builtin_cpu_supports("ssse3")
|
||||
#define CC_HAS_AVX1() __builtin_cpu_supports("avx")
|
||||
#define CC_HAS_AVX2() __builtin_cpu_supports("avx2")
|
||||
#define CC_HAS_AVX512_AND_IN_KERNEL() 0
|
||||
#define CC_HAS_BMI2() __builtin_cpu_supports("bmi2")
|
||||
#if CC_LINUX || !CC_INTERNAL_SDK
|
||||
#include <cpuid.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
CC_INLINE bool _cpu_supports_rdrand(void)
|
||||
{
|
||||
unsigned int eax, ebx, ecx, edx;
|
||||
__cpuid(1, eax, ebx, ecx, edx);
|
||||
return ecx & bit_RDRND;
|
||||
}
|
||||
|
||||
CC_INLINE bool _cpu_supports_adx(void)
|
||||
{
|
||||
unsigned int eax, ebx, ecx, edx;
|
||||
__cpuid_count(7, 0, eax, ebx, ecx, edx);
|
||||
return ebx & bit_ADX;
|
||||
}
|
||||
|
||||
#define CC_HAS_RDRAND() _cpu_supports_rdrand()
|
||||
#define CC_HAS_ADX() _cpu_supports_adx()
|
||||
#else
|
||||
#define CC_HAS_RDRAND() 0
|
||||
#define CC_HAS_ADX() 0
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
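/*
 * Usage sketch (editor's illustration, not part of corecrypto): the CC_HAS_*
 * predicates are evaluated at run time to select an implementation.
 */
#if defined(__x86_64__) || defined(__i386__)
static int aes_dispatch_example(void)
{
    if (CC_HAS_AESNI()) {
        return 1; /* take the AES-NI code path */
    }
    return 0;     /* fall back to the portable C implementation */
}
#endif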
|
||||
#if defined(__arm64__)
|
||||
|
||||
#if CC_DARWIN && CC_INTERNAL_SDK
|
||||
#include <System/arm/cpu_capabilities.h>
|
||||
#define CC_HAS_SHA512() (_get_cpu_capabilities() & kHasARMv82SHA512)
|
||||
#define CC_HAS_SHA3() (_get_cpu_capabilities() & kHasARMv82SHA3)
|
||||
#else
|
||||
#define CC_HAS_SHA512() (0)
|
||||
#define CC_HAS_SHA3() (0)
|
||||
#endif
|
||||
|
||||
#endif // defined(__arm64__)
|
||||
|
||||
#endif /* CORECRYPTO_CC_RUNTIME_CONFIG_H_ */
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/* Copyright (c) (2013,2015,2016,2019) Apple Inc. All rights reserved.
|
||||
*
|
||||
* corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
|
||||
* is contained in the License.txt file distributed with corecrypto) and only to
|
||||
* people who accept that license. IMPORTANT: Any license rights granted to you by
|
||||
* Apple Inc. (if any) are limited to internal use within your organization only on
|
||||
* devices and computers you own or control, for the sole purpose of verifying the
|
||||
* security characteristics and correct functioning of the Apple Software. You may
|
||||
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef corecrypto_arm_aes_compatability_h
|
||||
#define corecrypto_arm_aes_compatability_h
|
||||
|
||||
// #include <Availability.h>
|
||||
#include <sys/cdefs.h>
|
||||
|
||||
#if defined(__clang__) && ((defined(__apple_build_version__) && __apple_build_version__ > 5010000))
|
||||
#define __USES_V_CRYPTO_INTRINSICS 1
|
||||
#else
|
||||
#define __USES_V_CRYPTO_INTRINSICS 0
|
||||
#endif
|
||||
|
||||
|
||||
// AES INSTRUCTIONS
|
||||
// aese.16b v0, v1
|
||||
// aesd.16b v0, v1
|
||||
// aesmc.16b v0, v1
|
||||
// aesimc.16b v0, v1
|
||||
|
||||
// SHA1 INTRINSICS
|
||||
// sha1su0.4s v0, v1, v2
|
||||
// sha1su1.4s v0, v1
|
||||
// sha1c.4s v0, v1, v2 // or q0, s1, v2.4s
|
||||
// sha1m.4s v0, v1, v2 // or q0, s1, v2.4s
|
||||
// sha1p.4s v0, v1, v2 // or q0, s1, v2.4s
|
||||
// sha1h.4s v0, v1 // or s0, s1
|
||||
|
||||
// SHA256 INTRINSICS
|
||||
// sha256su0.4s v0, v1
|
||||
// sha256su1.4s v0, v1, v2
|
||||
// sha256h.4s v0, v1, v2 // or q0, q1, v2.4s
|
||||
// sha256h2.4s v0, v1, v2 // or q0, q1, v2.4s
|
||||
|
||||
|
||||
#if __USES_V_CRYPTO_INTRINSICS == 1
|
||||
.macro AESE
|
||||
aese.16b v$0, v$1
|
||||
.endm
|
||||
|
||||
.macro AESD
|
||||
aesd.16b v$0, v$1
|
||||
.endm
|
||||
|
||||
.macro AESMC
|
||||
aesmc.16b v$0, v$1
|
||||
.endm
|
||||
|
||||
.macro AESIMC
|
||||
aesimc.16b v$0, v$1
|
||||
.endm
|
||||
|
||||
|
||||
#else
|
||||
|
||||
.macro AESE
|
||||
aese q$0, q$1
|
||||
.endm
|
||||
|
||||
.macro AESD
|
||||
aesd q$0, q$1
|
||||
.endm
|
||||
|
||||
.macro AESMC
|
||||
aesmc q$0, q$1
|
||||
.endm
|
||||
|
||||
.macro AESIMC
|
||||
aesimc q$0, q$1
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
||||
#if __USES_V_CRYPTO_INTRINSICS == 1
|
||||
|
||||
.macro SHA1SU0
|
||||
sha1su0 v$0.4s, v$1.4s, v$2.4s
|
||||
.endm
|
||||
|
||||
.macro SHA1SU1
|
||||
sha1su1 v$0.4s, v$1.4s
|
||||
.endm
|
||||
|
||||
.macro SHA1C
|
||||
sha1c q$0, s$1, v$2.4s
|
||||
.endm
|
||||
|
||||
.macro SHA1M
|
||||
sha1m q$0, s$1, v$2.4s
|
||||
.endm
|
||||
|
||||
.macro SHA1P
|
||||
sha1p q$0, s$1, v$2.4s
|
||||
.endm
|
||||
|
||||
.macro SHA1H
|
||||
sha1h s$0, s$1
|
||||
.endm
|
||||
|
||||
.macro SHA256SU0
|
||||
sha256su0 v$0.4s, v$1.4s
|
||||
.endm
|
||||
|
||||
.macro SHA256SU1
|
||||
sha256su1 v$0.4s, v$1.4s, v$2.4s
|
||||
.endm
|
||||
|
||||
.macro SHA256H
|
||||
sha256h q$0, q$1, v$2.4s
|
||||
.endm
|
||||
|
||||
.macro SHA256H2
|
||||
sha256h2 q$0, q$1, v$2.4s
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro SHA1SU0
|
||||
sha1su0 q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
.macro SHA1SU1
|
||||
sha1su1 q$0, q$1
|
||||
.endm
|
||||
|
||||
.macro SHA1C
|
||||
sha1c q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
.macro SHA1M
|
||||
sha1m q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
.macro SHA1P
|
||||
sha1p q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
.macro SHA1H
|
||||
sha1h q$0, q$1
|
||||
.endm
|
||||
|
||||
.macro SHA256SU0
|
||||
sha256su0 q$0, q$1
|
||||
.endm
|
||||
|
||||
.macro SHA256SU1
|
||||
sha256su1 q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
.macro SHA256H
|
||||
sha256h q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
.macro SHA256H2
|
||||
sha256h2 q$0, q$1, q$2
|
||||
.endm
|
||||
|
||||
#endif
|
||||
#endif /*corecrypto_arm_aes_compatability_h*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,46 @@
/* Copyright (c) (2011,2015,2016,2018-2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_
#define _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_

/*
 * This file defines macros commonly used in handwritten assembly
 * to make functions BTI- and PAC-compatible.
 */

#ifndef __arm64e__
#define __arm64e__ 0
#endif

.macro SIGN_LR
#if __arm64e__
    pacibsp
#endif
.endmacro

.macro AUTH_LR_AND_RET
#if __arm64e__
    retab
#else
    ret
#endif
.endmacro

.macro BRANCH_TARGET_CALL
#if __arm64e__
    hint #34 /* bti c */
#endif
.endmacro

#endif /* _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_ */
@ -0,0 +1,596 @@
/* Copyright (c) (2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>
#include "../corecrypto_test/include/testmore.h"
#include "testbyteBuffer.h"
#include <stdbool.h>
#include <limits.h>

#define CC_SECURITY_TEST

#if (CC == 0)
entryPoint(cc_tests,"cc")
#else

#ifdef CC_SECURITY_TEST
#include <corecrypto/ccrng_test.h>
#include "cccycles.h"
#include "ccstats.h"
#include "ccconstanttime.h"
#endif

// Disable the static analyzer for the code below, since we deliberately
// access an uninitialized memory area on the stack.

#ifdef __clang_analyzer__
int stack_clear_test(size_t size);
#endif

#ifndef __clang_analyzer__

#if defined(__has_feature) && __has_feature(address_sanitizer)
#define CC_NO_SANITIZE __attribute__((no_sanitize_address))
#else
#define CC_NO_SANITIZE
#endif // __has_feature

#define STACK_MAGIC 0xC0DEBA5E

CC_NO_SANITIZE static void
stack_dirty(size_t size)
{
    volatile uint32_t array[size];
    for (size_t i = 0; i < size; i++) {
        array[i] = STACK_MAGIC;
    }
}

CC_NO_SANITIZE static void
stack_clear(size_t size)
{
    uint32_t array[size];
    cc_clear(sizeof(array), array);
}

CC_NO_SANITIZE static int
stack_test(size_t size)
{
    volatile uint32_t array[size];
    for (size_t i = 0; i < size; i++) {
        if (array[i] == STACK_MAGIC) {
            return 1; // error: the stack was not cleared.
        }
    }
    return 0;
}

CC_NO_SANITIZE static int
stack_clear_test(size_t size)
{
    stack_dirty(size);
    stack_clear(size);
    return stack_test(size);
}

#endif /* __clang_analyzer__ */
// Static analyzer re-enabled.

#define CLZ_RANDOM_TESTS 10000

static void
clz_tests(void) {
    int i;
    uint64_t r64;
    uint32_t r32;
    struct ccrng_state *rng = global_test_rng;

    is(cc_clz32_fallback(2863311530), cc_clz32(2863311530), "clz32 1010... pattern");
    is(cc_clz64_fallback(12297829382473034410U), cc_clz64(12297829382473034410U), "clz64 1010... pattern");
    is(cc_clz32_fallback(1431655765), cc_clz32(1431655765), "clz32 0101... pattern");
    is(cc_clz64_fallback(6148914691236517205U), cc_clz64(6148914691236517205U), "clz64 0101... pattern");

    for (i = 0; i < 32; i++) {
        is(cc_clz32_fallback(1U << i), cc_clz32(1U << i), "clz32");
        is(cc_clz32_fallback((1U << i) + 1), cc_clz32((1U << i) + 1), "clz32 + 1");
        is(cc_clz32_fallback((1U << i) + (1U << 16)), cc_clz32((1U << i) + (1U << 16)), "clz32 + 1 << 16");
    }

    for (i = 0; i < 64; i++) {
        is(cc_clz64_fallback(1ULL << i), cc_clz64(1ULL << i), "clz64");
        is(cc_clz64_fallback((1ULL << i) + 1), cc_clz64((1ULL << i) + 1), "clz64 + 1");
        is(cc_clz64_fallback((1ULL << i) + UINT_MAX + 1), cc_clz64((1ULL << i) + UINT_MAX + 1), "clz64 + 1 << 32");
    }

    for (i = 0; i < CLZ_RANDOM_TESTS; i++) {
        ccrng_generate(rng, sizeof(r64), &r64);
        is(cc_clz64_fallback(r64), cc_clz64(r64), "clz64 random");
        r32 = r64 >> 32;
        is(cc_clz32_fallback(r32), cc_clz32(r32), "clz32 random");
    }
}

#define CTZ_RANDOM_TESTS 10000

static void
ctz_tests(void) {
    int i;
    uint64_t r64;
    uint32_t r32;
    struct ccrng_state *rng = global_test_rng;

    is(cc_ctz32_fallback(2863311530), cc_ctz32(2863311530), "ctz32 1010... pattern");
    is(cc_ctz64_fallback(12297829382473034410U), cc_ctz64(12297829382473034410U), "ctz64 1010... pattern");
    is(cc_ctz32_fallback(1431655765), cc_ctz32(1431655765), "ctz32 0101... pattern");
    is(cc_ctz64_fallback(6148914691236517205U), cc_ctz64(6148914691236517205U), "ctz64 0101... pattern");

    for (i = 0; i < 32; i++) {
        is(cc_ctz32_fallback(1U << i), cc_ctz32(1U << i), "ctz32");
        is(cc_ctz32_fallback((1U << i) + 1), cc_ctz32((1U << i) + 1), "ctz32 + 1");
        is(cc_ctz32_fallback((1U << i) + (1U << 16)), cc_ctz32((1U << i) + (1U << 16)), "ctz32 + 1 << 16");
    }

    for (i = 0; i < 64; i++) {
        is(cc_ctz64_fallback(1ULL << i), cc_ctz64(1ULL << i), "ctz64");
        is(cc_ctz64_fallback((1ULL << i) + 1), cc_ctz64((1ULL << i) + 1), "ctz64 + 1");
        is(cc_ctz64_fallback((1ULL << i) + UINT_MAX + 1), cc_ctz64((1ULL << i) + UINT_MAX + 1), "ctz64 + 1 << 32");
    }

    for (i = 0; i < CTZ_RANDOM_TESTS; i++) {
        ccrng_generate(rng, sizeof(r64), &r64);
        is(cc_ctz64_fallback(r64), cc_ctz64(r64), "ctz64 random");
        r32 = r64 >> 32;
        is(cc_ctz32_fallback(r32), cc_ctz32(r32), "ctz32 random");
    }
}

#define FFS_RANDOM_TESTS 10000

static void
ffs_tests(void) {
    int i;
    int64_t r64;
    int32_t r32;
    struct ccrng_state *rng = global_test_rng;

    is(cc_ffs32_fallback(0), cc_ffs32(0), "ffs32 zero");
    is(cc_ffs64_fallback(0), cc_ffs64(0), "ffs64 zero");
    is(cc_ffs32_fallback((int32_t)2863311530), cc_ffs32((int32_t)2863311530), "ffs32 1010... pattern");
    is(cc_ffs64_fallback((int64_t)12297829382473034410U), cc_ffs64((int64_t)12297829382473034410U), "ffs64 1010... pattern");
    is(cc_ffs32_fallback(1431655765), cc_ffs32(1431655765), "ffs32 0101... pattern");
    is(cc_ffs64_fallback(6148914691236517205), cc_ffs64(6148914691236517205), "ffs64 0101... pattern");

    for (i = 0; i < 32; i++) {
        is(cc_ffs32_fallback(1 << i), cc_ffs32(1 << i), "ffs32");
        is(cc_ffs32_fallback((1 << i) + 1), cc_ffs32((1 << i) + 1), "ffs32 + 1");
        is(cc_ffs32_fallback((1 << i) + (1 << 16)), cc_ffs32((1 << i) + (1 << 16)), "ffs32 + 1 << 16");
    }

    for (i = 0; i < 64; i++) {
        is(cc_ffs64_fallback(1LL << i), cc_ffs64(1LL << i), "ffs64");
        is(cc_ffs64_fallback((1LL << i) + 1), cc_ffs64((1LL << i) + 1), "ffs64 + 1");
        is(cc_ffs64_fallback((1LL << i) + UINT_MAX + 1), cc_ffs64((1LL << i) + UINT_MAX + 1), "ffs64 + 1 << 32");
    }

    for (i = 0; i < FFS_RANDOM_TESTS; i++) {
        ccrng_generate(rng, sizeof(r64), &r64);
        is(cc_ffs64_fallback(r64), cc_ffs64(r64), "ffs64 random");
        r32 = r64 >> 32;
        is(cc_ffs32_fallback(r32), cc_ffs32(r32), "ffs32 random");
    }
}

static void
Rotate_Tests(void) {
    int c = 1;
    uint32_t result32 = 0xaaaaaaaa;
    uint64_t result64 = 0xaaaaaaaaaaaaaaaa;

    /* The first argument is NOT a variable on purpose */
    is(result32, CC_ROL(0x55555555, c), "CC_ROL 1");
    is(result32, CC_ROLc(0x55555555, 1), "CC_ROLc 1");
    is(result64, CC_ROL64(0x5555555555555555, c), "CC_ROL64 1");
    is(result64, CC_ROL64c(0x5555555555555555, 1), "CC_ROL64c 1");
    is(result32, CC_ROR(0x55555555, c), "CC_ROR 1");
    is(result32, CC_RORc(0x55555555, 1), "CC_RORc 1");
    is(result64, CC_ROR64(0x5555555555555555, c), "CC_ROR64 1");
    is(result64, CC_ROR64c(0x5555555555555555, 1), "CC_ROR64c 1");
}

static void
mux_Tests(void) {
    uint8_t i8;
    uint16_t i16;
    uint32_t i32;
    uint64_t i64;

    CC_MUXU(i8, 0, (uint8_t)0xAB, (uint8_t)0xBA);
    is(i8, 0xBA, "CC_MUXU(i8, s=0) selects b");
    CC_MUXU(i8, 1, (uint8_t)0xBA, (uint8_t)0xAB);
    is(i8, 0xBA, "CC_MUXU(i8, s=1) selects a");

    CC_MUXU(i16, 0, (uint16_t)0xAB00, (uint16_t)0xBA00);
    is(i16, 0xBA00, "CC_MUXU(i16, s=0) selects b");
    CC_MUXU(i16, 1, (uint16_t)0xBA00, (uint16_t)0xAB00);
    is(i16, 0xBA00, "CC_MUXU(i16, s=1) selects a");

    CC_MUXU(i32, 0, (uint32_t)0xAB00BEEF, (uint32_t)0xBA00BEEF);
    is(i32, 0xBA00BEEF, "CC_MUXU(i32, s=0) selects b");
    CC_MUXU(i32, 1, (uint32_t)0xBA00BEEF, (uint32_t)0xAB00BEEF);
    is(i32, 0xBA00BEEF, "CC_MUXU(i32, s=1) selects a");

    CC_MUXU(i64, 0, (uint64_t)0xAB00BEEF11223344, (uint64_t)0xBA00BEEF11223344);
    is(i64, 0xBA00BEEF11223344, "CC_MUXU(i64, s=0) selects b");
    CC_MUXU(i64, 1, (uint64_t)0xBA00BEEF11223344, (uint64_t)0xAB00BEEF11223344);
    is(i64, 0xBA00BEEF11223344, "CC_MUXU(i64, s=1) selects a");
}

static void
HEAVISIDE_STEP_Tests(void)
{
    uint8_t i8;
    uint16_t i16;
    uint32_t i32;
    uint64_t i64;
    size_t i; // loop index
    uint8_t err = 0, nb_test = 0;

    // Sanity check on intended lengths
    ok(sizeof(uint8_t) == 1, "sizeof(uint8_t)!=1");
    ok(sizeof(uint16_t) == 2, "sizeof(uint16_t)!=2");
    ok(sizeof(uint32_t) == 4, "sizeof(uint32_t)!=4");
    ok(sizeof(uint64_t) == 8, "sizeof(uint64_t)!=8");

    for (i = 0; i < 8 * sizeof(i8); i++) {
        nb_test++;
        CC_HEAVISIDE_STEP(i8, ((uint8_t)1 << i));
        if (i8 != 1) err++;
    }
    ok(err == 0, "CC_HEAVISIDE_STEP(i8)");

    for (i = 0; i < 8 * sizeof(i16); i++) {
        nb_test++;
        CC_HEAVISIDE_STEP(i16, ((uint16_t)1 << i));
        if (i16 != 1) err++;
    }
    ok(err == 0, "CC_HEAVISIDE_STEP(i16)");

    for (i = 0; i < 8 * sizeof(i32); i++) {
        nb_test++;
        CC_HEAVISIDE_STEP(i32, ((uint32_t)1 << i));
        if (i32 != 1) err++;
    }
    ok(err == 0, "CC_HEAVISIDE_STEP(i32)");

    for (i = 0; i < 8 * sizeof(i64); i++) {
        nb_test++;
        CC_HEAVISIDE_STEP(i64, ((uint64_t)1 << i));
        if (i64 != 1) err++;
    }
    ok(err == 0, "CC_HEAVISIDE_STEP(i64)");

    ok(err + (64 + 32 + 16 + 8) - nb_test == 0, "CC_HEAVISIDE_STEP test count");
}

static void
cmp_secure_functionalTests(void) {
#define ARRAY_SIZE 10

    // --- Bytes
    uint8_t array1[ARRAY_SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    uint8_t array2[ARRAY_SIZE];

    memcpy(array2, array1, sizeof(array1));
    // Equal
    ok(cc_cmp_safe(sizeof(array1), array1, array2) == 0, "array1 to array2");
    ok(cc_cmp_safe(sizeof(array1), array2, array1) == 0, "array2 to array1");

    // Length is zero
    ok(cc_cmp_safe(0, array2, array1) != 0, "Array of size 0");

    // Equal except the first byte
    array1[0]++;
    ok(cc_cmp_safe(sizeof(array1), array1, array2) != 0, "first byte");
    array1[0]--;

    // Equal except the last byte
    array1[sizeof(array1) - 1]++;
    ok(cc_cmp_safe(sizeof(array1), array1, array2) != 0, "last byte");
    array1[sizeof(array1) - 1]--;

    // --- cc_units
    uint64_t u64_array1[ARRAY_SIZE] = {0};
    for (size_t i = 0; i < ARRAY_SIZE; i++) u64_array1[i] = i;
    uint64_t u64_array2[ARRAY_SIZE];
    uint64_t tmp;

    memcpy(u64_array2, u64_array1, sizeof(u64_array1));
    // Equal
    ok(cc_cmp_safe(sizeof(u64_array1), u64_array1, u64_array2) == 0, "array1 to array2");
    ok(cc_cmp_safe(sizeof(u64_array1), u64_array2, u64_array1) == 0, "array2 to array1");

    // Length is zero
    ok(cc_cmp_safe(0, u64_array2, u64_array1) != 0, "Array of size 0");

    // Equal except the first byte
    ((uint8_t *)u64_array1)[0]++;
    ok(cc_cmp_safe(sizeof(u64_array1), u64_array1, u64_array2) != 0, "first byte");
    ((uint8_t *)u64_array1)[0]--;

    // Equal except the last byte
    CC_LOAD64_BE(tmp, &u64_array1[ARRAY_SIZE - 1]);
    CC_STORE64_BE(tmp ^ 0x80, &u64_array1[ARRAY_SIZE - 1]);
    ok(cc_cmp_safe(sizeof(u64_array1), u64_array1, u64_array2) != 0, "last byte");
    CC_STORE64_BE(tmp, &u64_array1[ARRAY_SIZE - 1]);
}

#ifdef CC_SECURITY_TEST

//======================================================================
// Constant-time verification parameters
//======================================================================

// Number of iterations for which timings are not taken into account,
// to let the system reach a stable performance state.
#define CC_WARMUP 10

// Each sample is the average time over many iterations with identical inputs.
#define CC_TIMING_REPEAT 150

// Number of samples for the statistical analysis;
// typically 100-1000 is a good range.
#define CC_TIMING_SAMPLES 200

// In case of failure, retry several times.
// This reduces false positives due to noise/timing accuracy.
// If the implementation is not constant time, its behavior is consistent,
// so retrying does not reduce the detection power.
#define CC_TIMING_RETRIES 10

// Two statistical tools are available: the T-test and Wilcoxon.
// The T-test assumes that the distributions to be compared are normal;
// Wilcoxon measures the offset between distributions.
// Due to potential switches between performance states or occasional
// latencies, Wilcoxon is recommended.
// > Set to 1 to use the T-test instead of Wilcoxon.
#define T_TEST 1

// Number of iterations of the full test (tune to evaluate the chance of false positives).
#define CMP_SECURITY_TEST_ITERATION 1

// Quantile for the repeated timing. Empirical value.
#define CC_TIMING_PERCENTILE 9

//======================================================================
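
/*
 * Sketch of the sampling strategy configured above (illustrative only; the
 * real harness is assumed to use the TIMING_WITH_QUANTILE macro from
 * ccconstanttime.h). Each sample is the q-th smallest of CC_TIMING_REPEAT
 * timings of the same input, which filters out interrupt and scheduler
 * noise. The helper name and callback signature below are hypothetical.
 */
#if 0
static uint64_t sample_quantile(uint64_t (*timed_op)(void), int repeat, int percentile_idx)
{
    uint64_t t[repeat];
    for (int k = 0; k < repeat; k++) {
        t[k] = timed_op(); // one timed execution with identical inputs
    }
    // Insertion sort: repeat is small (CC_TIMING_REPEAT == 150).
    for (int k = 1; k < repeat; k++) {
        uint64_t v = t[k];
        int m = k - 1;
        while (m >= 0 && t[m] > v) { t[m + 1] = t[m]; m--; }
        t[m + 1] = v;
    }
    // percentile_idx == 9 selects roughly the 90th percentile.
    return t[(repeat * percentile_idx) / 10];
}
#endif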

static const int verbose = 1;

#define TEST_LAST_BYTE 1
#define TEST_FIRST_BYTE 2
#define TEST_RANDOM 3
#define TEST_EQUAL 4

static int
cmp_secure_timeconstantTests(size_t length, struct ccrng_state *rng, uint32_t test_id) {
    // Random buffers for messages
    uint8_t array1[length];
    uint8_t array2[length];
    int failure_cnt = 0;
    int early_abort = 1;
    uint32_t j, sample_counter;
    bool retry = true;

    if (length == 0) { goto errOut; }
    j = 0;
    while (retry) {
        sample_counter = 0; // Index of the current sample
        measurement_t timing_sample[2 * CC_TIMING_SAMPLES];

        for (size_t i = 0; i < 2 * CC_TIMING_SAMPLES + (CC_WARMUP / CC_TIMING_REPEAT); i++) {
            ccrng_generate(rng, length, array1);
            volatile int cmp_result;
            if ((i & 1) == 0) {
                // -------------------------
                // Random
                // -------------------------
                switch (test_id) {
                // All equal, except the last byte
                case TEST_LAST_BYTE:
                    memcpy(array2, array1, length);
                    array2[length - 1] ^= 1;
                    break;
                // All equal, except the first byte
                case TEST_FIRST_BYTE:
                    memcpy(array2, array1, length);
                    array2[0] ^= 1;
                    break;
                // Random
                case TEST_RANDOM:
                    ccrng_generate(rng, length, array2);
                    break;
                // All equal
                case TEST_EQUAL:
                    memcpy(array2, array1, length);
                    break;
                default:
                    return 0; // failure
                }
            } else {
                // -------------------------
                // Equal
                // -------------------------
                memcpy(array2, array1, length);
            }
#if 1
            // Actual function to test
            TIMING_WITH_QUANTILE(timing_sample[sample_counter].timing,
                                 CC_TIMING_REPEAT,
                                 CC_TIMING_PERCENTILE,
                                 cmp_result = cc_cmp_safe(length, array1, array2), errOut);
#else
            // Reference which can be expected to fail
            TIMING_WITH_QUANTILE(timing_sample[sample_counter].timing,
                                 CC_TIMING_REPEAT,
                                 CC_TIMING_PERCENTILE,
                                 cmp_result = memcmp(array1, array2, length), errOut);
#endif
            timing_sample[sample_counter].group = sample_counter & 1;
#if CC_WARMUP
            if (i >= CC_WARMUP / CC_TIMING_REPEAT)
#endif
            {
                sample_counter++;
            }
        }
#if CCN_OSX
        if (verbose > 1) {
            char file_name[64];
            snprintf(file_name, sizeof(file_name), "corecrypto_test_cc_cmp_timings_%.2zu.csv", length);
            export_measurement_to_file(file_name, timing_sample, sample_counter);
        }
#endif
        // Process the results
#if T_TEST
        // T-test
        int status = T_test_isRejected(timing_sample, sample_counter);
#else
        // Wilcoxon rank-sum test
        int status = WilcoxonRankSumTest(timing_sample, sample_counter);
#endif
        if (status != 0) {
            j++; // retry counter
            if (j >= CC_TIMING_RETRIES) {
                diag("Constant timing FAILED for len %zu after %u attempts", length, j);
                //ok_or_fail((status==0),"Decrypt+padding constant timing");
                failure_cnt++;
                break;
            }
        } else {
            if ((verbose > 1) && (j > 0)) diag("Constant timing ok for len %zu after %u attempts (of %d)", length, j + 1, CC_TIMING_RETRIES);
            break;
        }
    } // retry

    early_abort = 0;
errOut:
    if (failure_cnt || early_abort) {
        return 0;
    }
    return 1;
}

#define CMP_SECURITY_TEST_MAX_LENGTH 2048
static void
memcmp_secure_securityTests(void) {
    // Random for messages
    struct ccrng_state *rng = global_test_rng;
    for (size_t i = 0; i < CMP_SECURITY_TEST_ITERATION; i++) {
        size_t r;
        ccrng_generate(rng, sizeof(r), &r);
        r = (r % CMP_SECURITY_TEST_MAX_LENGTH) + 1;
        ok(cmp_secure_timeconstantTests(r, rng, TEST_FIRST_BYTE), "Time constant check, first byte difference");
        ok(cmp_secure_timeconstantTests(r, rng, TEST_LAST_BYTE), "Time constant check, last byte difference");
        ok(cmp_secure_timeconstantTests(r, rng, TEST_RANDOM), "Time constant check, random");
        ok(cmp_secure_timeconstantTests(r, rng, TEST_EQUAL), "Time constant check of equal input - if it fails, it's a test issue");
    }
}
#endif // CC_SECURITY_TEST

#ifdef CC_SECURITY_TEST
#define kPlan_ccSecurityTestNb 5
#else
#define kPlan_ccSecurityTestNb 0
#endif

int cc_tests(TM_UNUSED int argc, TM_UNUSED char *const *argv)
{
    int num_tests = 36 + kPlan_ccSecurityTestNb;
    num_tests += 292 + 2 * CLZ_RANDOM_TESTS; // clz_tests
    num_tests += 292 + 2 * CTZ_RANDOM_TESTS; // ctz_tests
    num_tests += 294 + 2 * FFS_RANDOM_TESTS; // ffs_tests
    plan_tests(num_tests);

    clz_tests();
    ctz_tests();
    ffs_tests();

    // For the Windows port, many unsigned longs have been replaced with size_t.
    // This test makes sure corecrypto is agnostic to the change.
    // This test can be removed later on.
#if defined(_WIN64) && defined(_WIN32)
    ok(sizeof(size_t) != sizeof(unsigned long),
#else
    ok(sizeof(size_t) == sizeof(unsigned long),
#endif
       "Historically, corecrypto assumes size_t and long have the same size. On Win64, that is not the case");

    if (verbose) diag("Stack cleanup");
    ok(stack_clear_test(100) == 0, "Stack clearing");

    if (verbose) diag("mux test");
    mux_Tests();

    if (verbose) diag("HEAVISIDE_STEP test");
    HEAVISIDE_STEP_Tests();

    if (verbose) diag("Rotate test");
    Rotate_Tests();

    if (verbose) diag("Secure comparison test");
    cmp_secure_functionalTests();

#ifdef CC_SECURITY_TEST
    if (verbose) diag("Secure comparison security test");
    memcmp_secure_securityTests();
#endif // CC_SECURITY_TEST

    // Silence code coverage
    const char *label = "corecrypto";
    const uint8_t *buffer = (const uint8_t *)label;
    cc_print("label", strlen(label), buffer);

    return 0;
}

#endif // CC
@ -0,0 +1,54 @@
/* Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>

// cc_abort() is implemented to comply with FIPS 140-2, when the DRBG produces
// two equal consecutive blocks.

#if !CC_PROVIDES_ABORT

#error "This environment does not provide an abort()/panic()-like function"

#elif CC_KERNEL

#include <kern/debug.h>
void cc_abort(const char *msg)
{
    panic("%s", msg);
}

#elif CC_USE_L4

#include <sys/panic.h>
#include <stdarg.h>
void cc_abort(const char *msg)
{
    sys_panic(msg);
}

#elif CC_RTKIT

#include <RTK_platform.h>
void cc_abort(const char *msg)
{
    RTK_abort("%s", msg);
}

#else

#include <stdlib.h>
void cc_abort(const char *msg CC_UNUSED)
{
    abort();
}

#endif
@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>

void cc_atfork_child(void)
{
    ccrng_cryptographic_atfork_child();
}
@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>

void cc_atfork_parent(void)
{
    ccrng_cryptographic_atfork_parent();
}
@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>

void cc_atfork_prepare(void)
{
    ccrng_cryptographic_atfork_prepare();
}
@ -0,0 +1,35 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc.h>
#include "corecrypto/fipspost_trace.h"

#if ( CC_HAS_MEMSET_S == 1 ) && (defined( __STDC_WANT_LIB_EXT1__ ) && ( __STDC_WANT_LIB_EXT1__ == 1 ) )
void cc_clear(size_t len, void *dst)
{
    FIPSPOST_TRACE_EVENT;
    memset_s(dst, len, 0, len);
}
#elif defined(_WIN32) && !defined(__clang__) // Clang with Microsoft CodeGen doesn't support SecureZeroMemory
#include <windows.h>
void cc_clear(size_t len, void *dst)
{
    SecureZeroMemory(dst, len);
}
#else
void cc_clear(size_t len, void *dst)
{
    FIPSPOST_TRACE_EVENT;
    volatile char *vptr = (volatile char *)dst;
    while (len--)
        *vptr++ = '\0';
}
#endif
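
/*
 * Usage sketch (illustrative, hypothetical caller): clear key material before
 * the buffer goes out of scope. A plain memset() may be elided by the
 * optimizer as a dead store; cc_clear() guarantees the writes happen.
 */
#if 0
static void example_use_key(void)
{
    uint8_t key[32];
    /* ... derive and use the key ... */
    cc_clear(sizeof(key), key); // zeroize before the stack frame is reused
}
#endif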
@ -0,0 +1,26 @@
/* Copyright (c) (2014,2015,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>

int cc_cmp_safe(size_t num, const void *ptr1, const void *ptr2)
{
    size_t i;
    const uint8_t *s = (const uint8_t *)ptr1;
    const uint8_t *t = (const uint8_t *)ptr2;
    uint8_t flag = ((num == 0) ? 1 : 0); // If the length is 0, return an error
    for (i = 0; i < num; i++) {
        flag |= (s[i] ^ t[i]);
    }
    CC_HEAVISIDE_STEP(flag, flag); // flag = (flag==0) ? 0 : 1;
    return flag; // 0 iff all bytes were equal, 1 if there is any difference
}
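
/*
 * Usage sketch (illustrative, hypothetical helper): tag comparison for a MAC.
 * Unlike memcmp(), cc_cmp_safe() touches every byte regardless of where the
 * first mismatch occurs, so timing does not leak the length of the matching
 * prefix.
 */
#if 0
static int example_tag_matches(const uint8_t computed[16], const uint8_t received[16])
{
    return cc_cmp_safe(16, computed, received) == 0; // 0 means equal
}
#endif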
@ -0,0 +1,26 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>
#include "cc_debug.h"
#include "cc_memory.h"

#if CORECRYPTO_DEBUG
struct ws_dbg g_ws_dbg;
#endif

void cc_print(const char *label, size_t count, const uint8_t *s) {
    cc_printf("%s { %zu, ", label, count);
    for (size_t ix = 0; ix < count; ix++) {
        cc_printf("%.02x", s[ix]);
    }
    cc_printf(" }\n");
}
@ -0,0 +1,35 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_fault_canary.h>
#include <corecrypto/cc_fault_canary_internal.h>

const cc_fault_canary_t CCEC_FAULT_CANARY = { 0xce, 0x3c, 0xed, 0x46, 0x6b, 0x11, 0xbf, 0x08, 0x13, 0xa0, 0xd4, 0xbf, 0x89, 0x60, 0xeb, 0x56 };
const cc_fault_canary_t CCRSA_PSS_FAULT_CANARY = { 0xef, 0x49, 0xba, 0x59, 0x22, 0xfe, 0x10, 0xdd, 0x84, 0x4f, 0x24, 0xd6, 0xad, 0xc0, 0xa9, 0x93 };
const cc_fault_canary_t CCRSA_PKCS1_FAULT_CANARY = { 0xea, 0xc5, 0x4a, 0x7c, 0x9f, 0x28, 0xdf, 0x10, 0xb6, 0xe9, 0x3e, 0xb9, 0x1c, 0xd3, 0x3a, 0xc5 };

void cc_fault_canary_set(cc_fault_canary_t fault_canary_out, const cc_fault_canary_t fault_canary, size_t nbytes, const uint8_t *in1, const uint8_t *in2)
{
    // We need to be careful with our XORs.
    // The first loop XORs the inputs with the actual fault canary value.
    for (size_t ci = 0; ci < CC_FAULT_CANARY_SIZE; ci++) {
        size_t bi = ci % nbytes;
        fault_canary_out[ci] = in1[bi] ^ in2[bi] ^ fault_canary[ci];
    }

    // The second loop folds any remaining input bytes into the values already
    // written to the fault canary output buffer.
    for (size_t i = CC_FAULT_CANARY_SIZE; i < nbytes; i++) {
        size_t bi = i % nbytes;
        size_t ci = i % sizeof(CCEC_FAULT_CANARY);
        fault_canary_out[ci] = in1[bi] ^ in2[bi] ^ fault_canary_out[ci];
    }
}
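
/*
 * Usage sketch (illustrative, hypothetical helper): after a fault-protected
 * primitive fills fault_canary_out, the caller is assumed to compare it
 * against the expected constant in constant time and treat any mismatch as
 * an injected fault.
 */
#if 0
static int example_canary_ok(const cc_fault_canary_t canary_out)
{
    return cc_cmp_safe(CC_FAULT_CANARY_SIZE, canary_out, CCRSA_PKCS1_FAULT_CANARY) == 0;
}
#endif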
@ -0,0 +1,27 @@
/* Copyright (c) (2015,2016,2018,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_priv.h>

/*
 Per C99 ISO/IEC 9899:1999 §6.5.8 and §6.5.9, Relational operators:
 Each of the operators <, >, <=, >=, ==, != yields 1 if the specified relation is true and 0 if it is false. ... The result has type int.
 This also applies to other revisions of the C standard, such as C11.
*/
// Returns z = s ? a : b in constant time, where a and b are pointers. s must be either 0 or 1.
void *cc_muxp(int s, const void *a, const void *b)
{
    cc_assert(s == 1 || s == 0);
    uintptr_t ia = (uintptr_t)a;
    uintptr_t ib = (uintptr_t)b;
    uintptr_t cond = ~((uintptr_t)s - (uintptr_t)1); // s ? ~0 : 0; see above
    uintptr_t rc = (cond & ia) | (~cond & ib);
    return (void *)rc;
}
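
/*
 * Usage sketch (illustrative, hypothetical helper): select one of two buffers
 * without a branch, e.g. when the selection bit is secret. s must be exactly
 * 0 or 1, as required by cc_muxp().
 */
#if 0
static const uint8_t *example_select(int secret_bit, const uint8_t *a, const uint8_t *b)
{
    return (const uint8_t *)cc_muxp(secret_bit, a, b); // a if secret_bit==1, else b
}
#endif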
@ -0,0 +1,39 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#include <corecrypto/cc_runtime_config.h>
#include "cc_internal.h"

#if defined(__x86_64__)

bool cc_rdrand(uint64_t *rand)
{
    bool ok;

    if (CC_HAS_RDRAND()) {
        asm volatile ("rdrand %0; setc %1" : "=r"(*rand), "=qm"(ok) : : "cc");
    } else {
        *rand = 0;
        ok = false;
    }

    return ok;
}

#else

bool cc_rdrand(uint64_t *rand)
{
    *rand = 0;
    return false;
}

#endif
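
/*
 * Usage sketch (illustrative, hypothetical helper): RDRAND can transiently
 * fail (carry flag clear), so callers are assumed to retry a bounded number
 * of times before giving up.
 */
#if 0
static int example_rdrand_retry(uint64_t *out)
{
    for (int i = 0; i < 10; i++) {
        if (cc_rdrand(out)) {
            return 0; // success
        }
    }
    return -1; // retries exhausted or RDRAND unsupported
}
#endif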
@ -0,0 +1,25 @@
/* Copyright (c) (2014,2015,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#import "cc_unit.h"

NSString *cc_composeString(NSString *format, ...) {
    if (!format) return @"";

    NSString *composedString;
    va_list args;

    va_start(args, format);
    composedString = [[[NSString alloc] initWithFormat:format arguments:args] autorelease];
    va_end(args);

    return composedString;
}
@ -0,0 +1,21 @@
/* Copyright (c) (2010,2014,2015,2016,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#import "cc_unit.h"

NSString *cc_hex_string(size_t len, const unsigned char *s) {
    NSMutableString *r = [[NSMutableString alloc] initWithCapacity: 3 + len * 8];
    for (size_t ix = 0; ix < len; ++ix) {
        [r appendFormat: @"%.02x", s[ix]];
    }
    [r autorelease];
    return r;
}
@ -0,0 +1,85 @@
/* Copyright (c) (2014,2015,2016,2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#import <XCTest/XCTest.h>
#import <corecrypto/ccrng_test.h>
#import <corecrypto/ccrng_system.h>

NSString *cc_hex_string(size_t len, const unsigned char *s);
NSString *cc_composeString(NSString *format, ...);

#define XCAssertMemEquals(len, a1, a2, description, ...) \
({ \
    @try { \
        const void *_a1value = (a1); \
        const void *_a2value = (a2); \
        size_t _lenvalue = (len); \
        if (memcmp(_a1value, _a2value, _lenvalue) != 0) { \
            NSString *_expression = cc_composeString(description, ##__VA_ARGS__); \
            NSString *_a1encoded = cc_hex_string(_lenvalue, _a1value); \
            NSString *_a2encoded = cc_hex_string(_lenvalue, _a2value); \
            XCTFail(@"%@\n%@\n should be \n%@", _expression, _a1encoded, _a2encoded); \
        } \
    } \
    @catch (NSException *exception) { \
        XCTFail(@"An exception was caught"); \
    } \
})

#define XCAssertCharsEquals(len, a1, a2, description, ...) \
({ \
    @try { \
        const void *_a1value = (a1); \
        const void *_a2value = (a2); \
        size_t _lenvalue = (len); \
        if (memcmp(_a1value, _a2value, _lenvalue) != 0) { \
            NSString *_expression = cc_composeString(description, ##__VA_ARGS__); \
            NSString *_a1encoded = cc_hex_string(_lenvalue, _a1value); \
            NSString *_a2encoded = cc_hex_string(_lenvalue, _a2value); \
            XCTFail(@"%@\n%@\n should be \n%@", _expression, _a1encoded, _a2encoded); \
        } \
    } \
    @catch (NSException *exception) { \
        XCTFail(@"An exception was caught"); \
    } \
})

// When choosing the input seed, it must have the format "\x00\x01\x02\x03"...
#define XCTestRNG(rngname, input_seed) \
    struct ccrng_test_state _test_rng; \
    struct ccrng_state *rngname = (struct ccrng_state *)&_test_rng; \
    size_t seedlen = sizeof(input_seed) - 1; \
    uint8_t random_seed[16]; \
    uint8_t *seed = (uint8_t *)input_seed; \
    if (input_seed == NULL || seedlen == 0) \
    { \
        seed = random_seed; \
        seedlen = sizeof(random_seed); \
        struct ccrng_system_state system_rng; \
        XCTAssert(ccrng_system_init(&system_rng) == 0); \
        XCTAssert(ccrng_generate((struct ccrng_state *)&system_rng, seedlen, random_seed) == 0); \
        ccrng_system_done(&system_rng); \
    } else { \
        printf("Forced "); \
        seed = (uint8_t *)input_seed; \
    } \
    XCTAssert(ccrng_test_init(&_test_rng, seedlen, seed, "") == 0); \
    NSString *_seed_encoded = cc_hex_string(seedlen, seed); \
    printf("XCTestRNG seed: %s {", [_seed_encoded UTF8String]); \
    for (size_t i = 0; i < seedlen; i++) printf("\\x%02x", seed[i]); \
    printf("}\n");

#define XCTestRNG_Done(rng) \
    ccrng_test_done((struct ccrng_test_state *)rng); \
    rng = NULL;
@ -0,0 +1,74 @@
/* Copyright (c) (2012,2015,2016,2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_H_
#define _CORECRYPTO_FIPSPOST_H_

#include <stdint.h>
#include <corecrypto/cc_config.h>

// Boot-Arg fips_mode Flags
//
// FIPS_MODE_FLAG_FULL is the default value when no other value is set, which
// is the case for all production devices.
//
// When performing tests, if _FORCEFAIL is set to true, then the tests
// intentionally fail and log their failure. The kernelspace and userspace
// flags can be enabled independently.
//
// If it's not desired to panic, supply the _NOPANIC flag with the
// _FORCEFAIL flag.
//
// Additional logging can be enabled by supplying the _VERBOSE flag.
//
// _NOINTEG is used to ignore just the results of the module integrity
// check process, which is very useful when setting breakpoints in the
// kext for diagnostic or auditing purposes.
//
// Supplying _TRACE causes a trace buffer to be accumulated of the instrumented
// functions for only one execution of the POST. As the POST finishes, the
// _TRACE flag is cleared from the fips_mode and no further tracing will occur.
#define FIPS_MODE_FLAG_DEBUG        (1 << 0)
#define FIPS_MODE_FLAG_FULL         (1 << 1)
#define FIPS_MODE_FLAG_DISABLE      (1 << 2)
#define FIPS_MODE_FLAG_VERBOSE      (1 << 3)
#define FIPS_MODE_FLAG_US_FORCEFAIL (1 << 4)
#define FIPS_MODE_FLAG_KS_FORCEFAIL (1 << 5)
#define FIPS_MODE_FLAG_NOINTEG      (1 << 6)
#define FIPS_MODE_FLAG_TRACE        (1 << 7)
#define FIPS_MODE_FLAG_NOPANIC      (1 << 8)

#define FIPS_MODE_IS_DEBUG(MODE)        ((MODE) & FIPS_MODE_FLAG_DEBUG)
#define FIPS_MODE_IS_FULL(MODE)         ((MODE) & FIPS_MODE_FLAG_FULL)
#define FIPS_MODE_IS_DISABLE(MODE)      ((MODE) & FIPS_MODE_FLAG_DISABLE)
#define FIPS_MODE_IS_VERBOSE(MODE)      ((MODE) & FIPS_MODE_FLAG_VERBOSE)
#define FIPS_MODE_IS_US_FORCEFAIL(MODE) ((MODE) & FIPS_MODE_FLAG_US_FORCEFAIL)
#define FIPS_MODE_IS_KS_FORCEFAIL(MODE) ((MODE) & FIPS_MODE_FLAG_KS_FORCEFAIL)
#define FIPS_MODE_IS_NOINTEG(MODE)      ((MODE) & FIPS_MODE_FLAG_NOINTEG)
#define FIPS_MODE_IS_TRACE(MODE)        ((MODE) & FIPS_MODE_FLAG_TRACE)
#define FIPS_MODE_IS_NOPANIC(MODE)      ((MODE) & FIPS_MODE_FLAG_NOPANIC)

#if CC_KERNEL
#define FIPS_MODE_FLAG_FORCEFAIL     FIPS_MODE_FLAG_KS_FORCEFAIL
#define FIPS_MODE_IS_FORCEFAIL(MODE) FIPS_MODE_IS_KS_FORCEFAIL(MODE)
#else
#define FIPS_MODE_FLAG_FORCEFAIL     FIPS_MODE_FLAG_US_FORCEFAIL
#define FIPS_MODE_IS_FORCEFAIL(MODE) FIPS_MODE_IS_US_FORCEFAIL(MODE)
#endif

struct mach_header;

/*
 * Entrypoint for all POST tests.
 */
int fipspost_post(uint32_t fips_mode, struct mach_header *pmach_header);

#endif /* _CORECRYPTO_FIPSPOST_H_ */
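
/*
 * Usage sketch (illustrative, hypothetical caller): the flags combine
 * bitwise, e.g. forcing a POST failure without panicking while logging
 * verbosely.
 */
#if 0
static void example_fips_mode(void)
{
    uint32_t mode = FIPS_MODE_FLAG_FULL | FIPS_MODE_FLAG_FORCEFAIL |
                    FIPS_MODE_FLAG_NOPANIC | FIPS_MODE_FLAG_VERBOSE;
    if (FIPS_MODE_IS_FORCEFAIL(mode) && !FIPS_MODE_IS_NOPANIC(mode)) {
        /* a failing POST would panic here */
    }
    (void)mode;
}
#endif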
@ -0,0 +1,18 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_GET_CPU_KEY_H_
#define _CORECRYPTO_FIPSPOST_GET_CPU_KEY_H_

size_t fipspost_get_cpu_key(char *label, size_t label_size, cpu_type_t cpuType,
                            cpu_subtype_t cpusubtype);

#endif
@ -0,0 +1,101 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_GET_HMAC_H_
#define _CORECRYPTO_FIPSPOST_GET_HMAC_H_

#include <corecrypto/ccsha2.h>

struct mach_header;

/*
 * The pre-calculated SHA256 HMAC gets placed here for integrity
 * testing. The current value is a random number. Use a different random
 * number for each architecture type supported.
 */
#define FIPSPOST_PRECALC_HMAC_SIZE CCSHA256_OUTPUT_SIZE
#define FIPSPOST_HMAC_VALUE fipspost_precalc_hmac
#define FIPSPOST_PRECALC_HMAC_VARIABLE \
    __attribute__((section("__TEXT,__fips_hmacs"))) const unsigned char FIPSPOST_HMAC_VALUE[FIPSPOST_PRECALC_HMAC_SIZE]

#define FIPSPOST_PRECALC_HMAC(ARCH, MODE) \
    { ARCH, MODE, 0x10, 0xdc, 0xe5, 0x34, 0x6f, 0x01, \
      0xdd, 0x82, 0xf8, 0xad, 0xe5, 0x8f, 0xa1, 0xcc, \
      0xc1, 0x32, 0xe5, 0xa8, 0x53, 0xc8, 0x39, 0xa3, \
      0x84, 0x5f, 0x3b, 0xcb, 0x39, 0x9e, 0xd1, 0x7b }

/* Comprehensive list, in the order of mach/machine.h */
#define FIPSPOST_PRECALC_HMAC_VALUE_X86_64     FIPSPOST_PRECALC_HMAC(0x86, 0x64)
#define FIPSPOST_PRECALC_HMAC_VALUE_X86_32     FIPSPOST_PRECALC_HMAC(0x86, 0x32)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_4T     FIPSPOST_PRECALC_HMAC(0xa4, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_6      FIPSPOST_PRECALC_HMAC(0xa6, 0x00)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_V5TEJ  FIPSPOST_PRECALC_HMAC(0xa5, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_XSCALE FIPSPOST_PRECALC_HMAC(0xa5, 0x02)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7A     FIPSPOST_PRECALC_HMAC(0xa7, 0x0a)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7F     FIPSPOST_PRECALC_HMAC(0xa7, 0x0f)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7S     FIPSPOST_PRECALC_HMAC(0xa7, 0x05)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7K     FIPSPOST_PRECALC_HMAC(0xa7, 0x04)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_6M     FIPSPOST_PRECALC_HMAC(0xa6, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7M     FIPSPOST_PRECALC_HMAC(0xa7, 0x06)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7EM    FIPSPOST_PRECALC_HMAC(0xa7, 0x07)

#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64     FIPSPOST_PRECALC_HMAC(0xa8, 0x64)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64_V8  FIPSPOST_PRECALC_HMAC(0xa8, 0x68)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64E    FIPSPOST_PRECALC_HMAC(0xa8, 0x6e)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64_32  FIPSPOST_PRECALC_HMAC(0xa8, 0x32)

#define FIPSPOST_CREATE_PRECALC_HMAC(ARCH, VARIANT) \
    FIPSPOST_PRECALC_HMAC_VARIABLE = FIPSPOST_PRECALC_HMAC_VALUE ## _ ## ARCH ## _ ## VARIANT;

/*
 * Declare the individual variants based on the current architecture. Use the
 * raw compiler flags because each archive must have a different value, even if
 * they're all classed as '__arm__', to avoid duplicate values in a FAT file.
 */
#if defined(__x86_64__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(X86, 64)
#elif defined(__i386__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(X86, 32)
#elif defined(__ARM_ARCH_4T__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 4T)
#elif defined(__ARM_ARCH_6K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 6)
// Unknown compiler flags for V5TEJ
// Unknown compiler flags for XSCALE
#elif defined(__ARM_ARCH_7A__) && !defined(__ARM_ARCH_7K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7A)
#elif defined(__ARM_ARCH_7F__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7F)
#elif defined(__ARM_ARCH_7S__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7S)
#elif defined(__ARM_ARCH_7K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7K)
#elif defined(__ARM_ARCH_6M__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 6M)
#elif defined(__ARM_ARCH_7M__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7M)
#elif defined(__ARM_ARCH_7EM__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7EM)
#elif defined(__arm64e__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64E)
#elif defined(__ARM64_ARCH_8_32__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64_32)
#elif defined(__ARM_ARCH_ISA_A64)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64)
// Unknown compiler flags for 64_V8
#else
#error Unsupported architecture type; add as necessary in the order of mach/machine.h.
#endif

#define FIPSPOST_EXTERN_PRECALC_HMAC extern FIPSPOST_PRECALC_HMAC_VARIABLE;

int fipspost_get_hmac(const struct mach_header *pmach_header, unsigned char *sha256HMACBuffer, size_t max_offset);

#endif
@ -0,0 +1,33 @@
/* Copyright (c) (2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_INDICATOR_H_
#define _CORECRYPTO_FIPSPOST_INDICATOR_H_

/// Checks if a symmetric algorithm mode is allowed for the given key size.
int fips_allowed_mode(const void *mode, size_t key_byte_length);

/// Checks whether a function is allowed according to FIPS. The arguments, when
/// required, make precise the context in which the function will be used. E.g., for a
/// SHA* hash function no parameters are needed, since the function alone defines the
/// use; by contrast, a symmetric mode requires the key length in bytes and the
/// cryptographic algorithm. num_args is the number of passed arguments; it can
/// currently be 0, 1, or 2. Depending on num_args, the following arguments can be:
/// * num_args == 1:
///   - struct ccdigest_info * for a DRBG function
///   - ccec_const_cp_t for an ECC function
///   - struct ccdigest_info * for an HMAC function
///   - ccdh_const_gp_t for a DH function
///   - ccec_const_cp_t for an ECDH function
///   - key_byte_length for a KDF CTR CMAC function
///   - struct ccdigest_info * for a KDF CTR HMAC or PBKDF2 function
///   - key_bit_length for RSA related functions
int fips_allowed(const void *function, size_t num_args, ...);

#endif /* _CORECRYPTO_FIPSPOST_INDICATOR_H_ */
@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_CBC_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_CBC_H_

int fipspost_post_aes_cbc(uint32_t fips_mode);

#endif
@ -0,0 +1,20 @@
/* Copyright (c) (2018,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */
// Created on 5/1/18.
//
// Copyright (c) 2018 Apple Inc. All rights reserved.

#ifndef fipspost_post_aes_ccm_h
#define fipspost_post_aes_ccm_h

int fipspost_post_aes_ccm(uint32_t fips_mode);

#endif /* fipspost_post_aes_ccm_h */
@ -0,0 +1,18 @@
/* Copyright (c) (2017,2019,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_CMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_CMAC_H_

int fipspost_post_aes_cmac(uint32_t fips_mode);

#endif
@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_ECB_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_ECB_H_

int fipspost_post_aes_ecb(uint32_t fips_mode);

#endif
@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_GCM_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_GCM_H_

int fipspost_post_aes_gcm(uint32_t fips_mode);

#endif

@@ -0,0 +1,20 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_SKG_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_SKG_H_

int fipspost_post_aes_skg_enc_ecb_128(uint32_t fips_mode);
int fipspost_post_aes_skg_dec_ecb_128(uint32_t fips_mode);
int fipspost_post_aes_skg_enc_cbc_128(uint32_t fips_mode);
int fipspost_post_aes_skg_dec_cbc_128(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_TRNG_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_TRNG_H_

int fipspost_post_aes_trng(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_AES_XTS_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_XTS_H_

int fipspost_post_aes_xts(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_CTR_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_CTR_H_

int fipspost_post_drbg_ctr(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_HMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_HMAC_H_

int fipspost_post_drbg_hmac(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_TRNG_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_TRNG_H_

int fipspost_post_drbg_trng(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_ECDH_H_
#define _CORECRYPTO_FIPSPOST_POST_ECDH_H_

int fipspost_post_ecdh(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_ECDSA_H_
#define _CORECRYPTO_FIPSPOST_POST_ECDSA_H_

int fipspost_post_ecdsa(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_FFDH_H_
#define _CORECRYPTO_FIPSPOST_POST_FFDH_H_

int fipspost_post_ffdh(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_HMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_HMAC_H_

int fipspost_post_hmac(uint32_t fips_mode);

#endif

@@ -0,0 +1,19 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_
#define _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_

#include <stdint.h>

int fipspost_post_indicator(uint32_t fips_mode);

#endif /* _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_ */

@@ -0,0 +1,19 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_INTEGRITY_H_
#define _CORECRYPTO_FIPSPOST_POST_INTEGRITY_H_

struct mach_header;

int fipspost_post_integrity(uint32_t fips_mode, struct mach_header *pmach_header);

#endif

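Unlike its siblings, the integrity POST needs the Mach-O header of the image being verified. A hedged, Darwin-only sketch of wiring it up via dyld follows; the choice of image index 0 (the main executable) and the cast away of const are illustrative assumptions, not something this header specifies.

    #include <stdint.h>
    #include <mach-o/dyld.h>
    #include <mach-o/loader.h>

    int fipspost_post_integrity(uint32_t fips_mode, struct mach_header *pmach_header);

    static int check_own_integrity(uint32_t fips_mode)
    {
        /* _dyld_get_image_header(0) returns the main executable's Mach-O header. */
        struct mach_header *mh = (struct mach_header *)_dyld_get_image_header(0);
        return fipspost_post_integrity(fips_mode, mh);
    }
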
@@ -0,0 +1,17 @@
/* Copyright (c) (2012,2015,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_KDF_CTR_H_
#define _CORECRYPTO_FIPSPOST_POST_KDF_CTR_H_

int fipspost_post_kdf_ctr(uint32_t fips_mode);

#endif

@@ -0,0 +1,17 @@
/* Copyright (c) (2012,2015,2020) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_PBKDF_H_
#define _CORECRYPTO_FIPSPOST_POST_PBKDF_H_

int fipspost_post_pbkdf(uint32_t fips_mode);

#endif

@@ -0,0 +1,22 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
 *
 * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
 * is contained in the License.txt file distributed with corecrypto) and only to
 * people who accept that license. IMPORTANT: Any license rights granted to you by
 * Apple Inc. (if any) are limited to internal use within your organization only on
 * devices and computers you own or control, for the sole purpose of verifying the
 * security characteristics and correct functioning of the Apple Software. You may
 * not, directly or indirectly, redistribute the Apple Software or any portions thereof.
 */

#ifndef _CORECRYPTO_FIPSPOST_POST_RSA_H_
#define _CORECRYPTO_FIPSPOST_POST_RSA_H_

#include <stdint.h>
#include <stdlib.h>

// DER RSA key used for RSA operation tests pulled from FIPS 186-2 RSA test vectors.
extern const uint8_t fipspost_post_rsa_test_key[];
extern const size_t fipspost_post_rsa_test_key_nbytes;

#endif

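A consumer of these externs would hand the DER blob to an RSA private-key importer. The sketch below assumes corecrypto's ccrsa_import_priv() from the public RSA interface and a caller who has already sized the key context; verify the exact signature against ccrsa.h before relying on it.

    #include <corecrypto/ccrsa.h>

    /* Hypothetical helper: parse the FIPS 186-2 test key into an already-sized
     * RSA context. ccrsa_import_priv() is an assumption from corecrypto's
     * public RSA API; zero-on-success is the usual corecrypto convention. */
    static int load_post_rsa_key(ccrsa_full_ctx_t key)
    {
        return ccrsa_import_priv(key,
                                 fipspost_post_rsa_test_key_nbytes,
                                 fipspost_post_rsa_test_key);
    }
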
Some files were not shown because too many files have changed in this diff.