first commit

vel 2021-12-31 13:03:01 -08:00
commit 67e48c5b1d
Signed by: velvox
GPG Key ID: 8C470C59E7724537
1636 changed files with 1375946 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
build

310
CMakeLists.txt Normal file

@@ -0,0 +1,310 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#
# CMake corecrypto build for Linux
#
# This CMake build generates the corecrypto_static library. It is meant to be
# used for Linux only.
#
cmake_minimum_required(VERSION 3.4.3)
set(CMAKE_OSX_SYSROOT "macosx.internal") # NOTE: This must be set before the call to project
project (corecrypto C)
option(CC_LINUX_ASM "Enable assembler support on Linux platform" OFF)
include (CoreCryptoSources.cmake)
#
# Build Macros and Targets
#
# get_include_dirs: extract include directories from list of headers
macro (get_include_dirs out in)
  foreach (file ${in})
    # Add directory including the header
    get_filename_component(dir ${file} DIRECTORY)
    list(APPEND ${out} ${dir})
    # If the directory is corecrypto, we should also add its
    # parent to the include dir.
    get_filename_component(dirname ${dir} NAME)
    if (${dirname} STREQUAL "corecrypto")
      get_filename_component(parent ${dir} DIRECTORY)
      list(APPEND ${out} ${parent})
    endif()
  endforeach()
endmacro()
# Project-level settings
## Build all objects with -fPIC
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
## CMake spelling of -std=gnu99
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS ON)
## Project-globals
set_property(DIRECTORY
APPEND PROPERTY COMPILE_DEFINITIONS
COMPILING_CORECRYPTO=1
$<$<CONFIG:Debug>:DEBUG=1>
$<$<CONFIG:Release>:NDEBUG>
)
set(CC_C_OPTIONS
-DBUILDKERNEL=0
-Wundef
-Wcast-qual
-Wno-error=deprecated-declarations
$<$<CONFIG:Debug>:-Werror>
)
add_compile_options(
"$<$<COMPILE_LANGUAGE:C>:${CC_C_OPTIONS}>"
)
# System dependencies
find_package(UnixCommands REQUIRED) # For ${BASH}
find_package(Threads REQUIRED)
find_library(MATH_LIBRARY m DOC "libm")
if(NOT MATH_LIBRARY)
message(SEND_ERROR "Could not find libm")
endif()
# Platform-specific dependencies
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
find_library(SYSTEM_FRAMEWORK NAMES System)
mark_as_advanced(SYSTEM_FRAMEWORK)
find_path(SYSTEM_CPU_CAPABILITIES_PATH i386/cpu_capabilities.h
HINTS "${SYSTEM_FRAMEWORK}/PrivateHeaders")
mark_as_advanced(SYSTEM_CPU_CAPABILITIES_PATH)
if(NOT SYSTEM_FRAMEWORK OR NOT SYSTEM_CPU_CAPABILITIES_PATH)
unset(SYSTEM_FRAMEWORK CACHE)
message(SEND_ERROR
"Could not find internal System.framework\n"
"HINT: Run cmake with xcrun to point it at the right SDK, or try:\n"
" ${CMAKE_COMMAND} -DCMAKE_OSX_SYSROOT=macosx.internal .")
else()
message("-- Found internal System.framework")
endif()
# Compile assembler sources in OSX
enable_language(ASM)
# Enable FIPS POST trace in OSX
set_source_files_properties(cc_fips/src/fipspost_trace.c cc_fips/crypto_test/crypto_test_cc_fips.c
PROPERTIES COMPILE_FLAGS -DCORECRYPTO_POST_TRACE=1)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
# Exclude sources that don't apply to Linux (or haven't yet been ported)
set (CORECRYPTO_EXCLUDE_SRCS
# exclude files that are OSX dependent
cc_fips/src/fipspost_get_cpu_key.c
cc_fips/src/fipspost_get_hmac.c
cckprng/src/cckprng_diag.c
cckprng/src/cckprng_diaggens.c
cckprng/src/cckprng_generate.c
cckprng/src/cckprng_init.c
cckprng/src/cckprng_initgen.c
cckprng/src/cckprng_loadseed.c
cckprng/src/cckprng_printdiag.c
cckprng/src/cckprng_ratchetseed.c
cckprng/src/cckprng_refresh.c
cckprng/src/cckprng_rekeygen.c
cckprng/src/cckprng_rekeygens.c
cckprng/src/cckprng_reseed.c
cckprng/src/cckprng_storeseed.c
cckprng/src/prng.c
)
set (CORECRYPTO_TEST_EXCLUDE_SRCS
# exclude files that are OSX dependent
cc_fips/src/fipspost_get_cpu_key.c
cc_fips/src/fipspost_get_hmac.c
corecrypto_test/lib/ccshadow.c
corecrypto_test/lib/cccycles.c
cckprng/crypto_test/crypto_test_kprng.c
# this test requires trace to be enabled
cc_fips/crypto_test/crypto_test_cc_fips.c
)
set (CORECRYPTO_PERF_EXCLUDE_SRCS
# exclude files that are OSX dependent
corecrypto_perf/src/ccperf_kprng.c
)
if (CC_LINUX_ASM)
enable_language(ASM)
# Add assembler specific clang flags
set (CC_ASM_OPTIONS
-integrated-as # Always use clang internal assembler
-x assembler-with-cpp # Run preprocessor despite .s name
)
add_compile_options(
"$<$<COMPILE_LANGUAGE:ASM>:${CC_ASM_OPTIONS}>"
)
# Enable Linux assembler in corecrypto
add_compile_options(
"-DCC_LINUX_ASM=1"
)
endif()
endif()
include(GNUInstallDirs)
if(NOT CMAKE_C_COMPILER_ID MATCHES "Clang")
message(FATAL_ERROR "Only clang is supported for compilation, found ${CMAKE_C_COMPILER_ID} (${CMAKE_C_COMPILER})")
endif()
#
# corecrypto_static library target
#
# A few include dirs cannot be automatically generated by the above headers
# list. Manually fix it up.
set (CORECRYPTO_FIXED_INCLUDE_DIRS
ccaes/src/vng
cckprng
cckprng/corecrypto
corecrypto_test/include
acceleratecrypto/Include
acceleratecrypto/Header
ccec25519/src
)
# Find include dirs for corecrypto_static headers.
set (cc_include_dir ${CORECRYPTO_FIXED_INCLUDE_DIRS})
get_include_dirs (cc_include_dir "${CORECRYPTO_PROJECT_HDRS}")
get_include_dirs (cc_include_dir "${CORECRYPTO_PUBLIC_HDRS}")
get_include_dirs (cc_include_dir "${CORECRYPTO_PRIVATE_HDRS}")
list (REMOVE_DUPLICATES cc_include_dir)
# Filter out excluded sources
if(CORECRYPTO_EXCLUDE_SRCS)
list(REMOVE_ITEM CORECRYPTO_SRCS ${CORECRYPTO_EXCLUDE_SRCS})
endif()
# Create target for corecrypto_static
add_library(corecrypto_static STATIC ${CORECRYPTO_SRCS})
target_link_libraries(corecrypto_static
PRIVATE $<$<PLATFORM_ID:Darwin>:${SYSTEM_FRAMEWORK}> ${MATH_LIBRARY})
target_include_directories(corecrypto_static PRIVATE ${cc_include_dir})
set_property(TARGET corecrypto_static PROPERTY POSITION_INDEPENDENT_CODE ON)
# Generate pkgconfig for corecrypto_static
configure_file("corecrypto.pc.in" "corecrypto.pc" @ONLY)
# Install corecrypto_static
install (TARGETS corecrypto_static ARCHIVE
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
install (FILES ${CORECRYPTO_PUBLIC_HDRS} ${CORECRYPTO_PRIVATE_HDRS}
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/corecrypto")
install (FILES ${CMAKE_CURRENT_BINARY_DIR}/corecrypto.pc
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
#
# corecrypto_test target
#
# Remove the .inc and other non C files from the sources
foreach (file ${CORECRYPTO_TEST_SRCS})
string (REGEX MATCH ".+\\.c$" match ${file})
if (NOT match)
list (REMOVE_ITEM CORECRYPTO_TEST_SRCS ${file})
endif()
endforeach()
# A few include dirs cannot be automatically generated by the above headers
# list. Manually fix it up.
set (CORECRYPTO_TEST_FIXED_INCLUDE_DIRS
ccsha2/src
ccrng/src
ccec25519/src
ccaes/src/ios_hardware
corecrypto_test
cczp/src
)
# Find include dirs for corecrypto_test headers.
set (cctest_include_dir ${CORECRYPTO_TEST_FIXED_INCLUDE_DIRS})
get_include_dirs (cctest_include_dir "${CORECRYPTO_TEST_HDRS}")
get_include_dirs (cctest_include_dir "${CORECRYPTO_TEST_SRCS}")
list (REMOVE_DUPLICATES cctest_include_dir)
# Create target for corecrypto_test
if(CORECRYPTO_TEST_EXCLUDE_SRCS)
list (REMOVE_ITEM CORECRYPTO_TEST_SRCS ${CORECRYPTO_TEST_EXCLUDE_SRCS})
endif()
add_executable(corecrypto_test ${CORECRYPTO_TEST_SRCS})
target_compile_definitions(corecrypto_test PRIVATE CC_UNITTEST=1)
target_include_directories(corecrypto_test
PRIVATE ${cctest_include_dir} ${cc_include_dir})
target_link_libraries(corecrypto_test PRIVATE corecrypto_static
Threads::Threads ${MATH_LIBRARY} ${CMAKE_DL_LIBS})
# Generate test vectors
set(CC_CONVERT_TEST_VECTORS scripts/convert_testvectors.sh)
set(CC_TEST_VECTORS corecrypto_test/test_vectors/wycheproof/chacha20_poly1305_test.json)
set(GENERATED_TEST_VECTORS_DIR ${CMAKE_CURRENT_BINARY_DIR}/gen/corecrypto_test/include)
set(GENERATED_TEST_VECTORS ${GENERATED_TEST_VECTORS_DIR}/cc_generated_test_vectors.h
)
add_custom_command(
OUTPUT ${GENERATED_TEST_VECTORS}
COMMAND ${CMAKE_COMMAND} -E make_directory ${GENERATED_TEST_VECTORS_DIR}
COMMAND ${BASH} ${CMAKE_SOURCE_DIR}/${CC_CONVERT_TEST_VECTORS} ${GENERATED_TEST_VECTORS} ${CMAKE_CURRENT_SOURCE_DIR}/corecrypto_test/test_vectors/wycheproof
COMMENT "Generating test vectors"
DEPENDS ${CC_CONVERT_TEST_VECTORS} ${CC_TEST_VECTORS}
)
target_sources(corecrypto_test PRIVATE ${GENERATED_TEST_VECTORS})
target_include_directories(corecrypto_test PRIVATE ${GENERATED_TEST_VECTORS_DIR})
set(CC_CONVERT_TEST_VECTORS_PC scripts/convert_h2c_testvectors.py)
message(STATUS "Running python convert_h2c_testvectors.py")
execute_process(
COMMAND ${PYTHON} ${CMAKE_SOURCE_DIR}/${CC_CONVERT_TEST_VECTORS_PC} ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE RESULT_PC
OUTPUT_VARIABLE OUTPUT_PC
ERROR_VARIABLE ERROR_PC
)
message(STATUS "result convert_vectors: ${RESULT_PC}")
message(STATUS "output convert_vectors: ${OUTPUT_PC}")
message(STATUS "error convert_vectors: ${ERROR_PC}")
#
# corecrypto_perf target
#
# ccperf.h lives in corecrypto_perf/corecrypto. Add it to the include dirs
set (CORECRYPTO_PERF_FIXED_INCLUDE_DIRS
corecrypto_perf/corecrypto
)
set (ccperf_include_dir ${CORECRYPTO_PERF_FIXED_INCLUDE_DIRS})
# Create target for corecrypto_perf
if(CORECRYPTO_PERF_EXCLUDE_SRCS)
list (REMOVE_ITEM CORECRYPTO_PERF_SRCS ${CORECRYPTO_PERF_EXCLUDE_SRCS})
endif()
add_executable(corecrypto_perf ${CORECRYPTO_PERF_SRCS})
target_include_directories(corecrypto_perf
PRIVATE ${ccperf_include_dir} ${cctest_include_dir} ${cc_include_dir})
target_link_libraries(corecrypto_perf PRIVATE corecrypto_static Threads::Threads ${MATH_LIBRARY})

1135
CoreCryptoSources.cmake Normal file

File diff suppressed because it is too large

61
License.txt Normal file

@@ -0,0 +1,61 @@
Copyright (c) Apple Inc. All rights reserved.
corecrypto Internal Use License Agreement
IMPORTANT: This Apple corecrypto software is supplied to you by Apple Inc. ("Apple")
in consideration of your agreement to the following terms, and your download or use
of this Apple software constitutes acceptance of these terms. If you do not agree
with these terms, please do not download or use this Apple software.
1. As used in this Agreement, the term "Apple Software" collectively means and
includes all of the Apple corecrypto materials provided by Apple here, including
but not limited to the Apple corecrypto software, frameworks, libraries, documentation
and other Apple-created materials. In consideration of your agreement to abide by the
following terms, conditioned upon your compliance with these terms and subject to
these terms, Apple grants you, for a period of ninety (90) days from the date you
download the Apple Software, a limited, non-exclusive, non-sublicensable license
under Apple's copyrights in the Apple Software to make a reasonable number of copies
of, compile, and run the Apple Software internally within your organization only on
devices and computers you own or control, for the sole purpose of verifying the
security characteristics and correct functioning of the Apple Software; provided
that you must retain this notice and the following text and disclaimers in all
copies of the Apple Software that you make. You may not, directly or indirectly,
redistribute the Apple Software or any portions thereof. The Apple Software is only
licensed and intended for use as expressly stated above and may not be used for other
purposes or in other contexts without Apple's prior written permission. Except as
expressly stated in this notice, no other rights or licenses, express or implied, are
granted by Apple herein.
2. The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES
OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, REGARDING
THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS,
SYSTEMS, OR SERVICES. APPLE DOES NOT WARRANT THAT THE APPLE SOFTWARE WILL MEET YOUR
REQUIREMENTS, THAT THE OPERATION OF THE APPLE SOFTWARE WILL BE UNINTERRUPTED OR
ERROR-FREE, THAT DEFECTS IN THE APPLE SOFTWARE WILL BE CORRECTED, OR THAT THE APPLE
SOFTWARE WILL BE COMPATIBLE WITH FUTURE APPLE PRODUCTS, SOFTWARE OR SERVICES. NO ORAL
OR WRITTEN INFORMATION OR ADVICE GIVEN BY APPLE OR AN APPLE AUTHORIZED REPRESENTATIVE
WILL CREATE A WARRANTY.
3. IN NO EVENT SHALL APPLE BE LIABLE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ARISING
IN ANY WAY OUT OF THE USE, REPRODUCTION, COMPILATION OR OPERATION OF THE APPLE
SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING
NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
4. This Agreement is effective until terminated. Your rights under this Agreement will
terminate automatically without notice from Apple if you fail to comply with any term(s)
of this Agreement. Upon termination, you agree to cease all use of the Apple Software
and destroy all copies, full or partial, of the Apple Software. This Agreement will be
governed and construed in accordance with the laws of the State of California, without
regard to its choice of law rules.
You may report security issues about Apple products to product-security@apple.com,
as described here:  https://www.apple.com/support/security/. Non-security bugs and
enhancement requests can be made via https://bugreport.apple.com as described
here: https://developer.apple.com/bug-reporting/
EA1350
10/5/15

13
Makefile Normal file

@@ -0,0 +1,13 @@
# Copyright (c) (2017,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#
coverage:
./scripts/corecrypto_coverage.sh

127
README.md Normal file

@@ -0,0 +1,127 @@
/* Copyright (c) (2010,2012,2014-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
The corecrypto (cc) project
===========================
The main goal is to provide low-level, fast math routines and crypto APIs which
can be used in various environments (Kernel, bootloader, userspace, etc.). It
is an explicit goal to minimize dependencies between modules and functions so
that clients of this library only end up with the routines they need and
nothing more.
Corecrypto compiles under all Apple OSs, Windows, Android and Linux.
Corecrypto Modules
------------------
corecrypto currently consists of the following submodules:
* `cc`: Headers and code common to all of the modules
* `ccasn1`: ASN.1 typeid constants and ccoid definition.
* `ccder`: DER encoding/decoding support
* `ccn`: Math on vectors of n cc_units
* `cczp`: Modular arithmetic mod integer p, on vectors of n cc_units
* `ccz`: Variable sized signed integer math routines
* `ccdrbg`: Deterministic Random Byte Generators
* `ccrng`: Random Byte Generators
* `ccdh`: Diffie-Hellman routines.
* `ccec25519`: Elliptic curve signature and Diffie-Hellman routines using the Edwards 25519 curve
* `ccrsa`: RSA routines.
* `ccec`: Elliptic curves, EC-specific math and APIs
* `ccdigest`: Digest abstraction layer.
* `cchmac`: HMAC using any ccdigest.
* `ccpbkdf2`: PBKDF2 using any ccdigest.
* `ccmd2`: MD2 digest implementations.
* `ccmd4`: MD4 digest implementations.
* `ccmd5`: MD5 digest implementations.
* `ccripemd`: RIPE-MD digest implementations.
* `ccsha1`: SHA-1 digest implementations.
* `ccsha2`: SHA-2 digest implementations.
* `ccmode`: Symmetric cipher chaining mode interfaces.
* `ccpad`: Symmetric cipher padding code.
* `ccaes`: AES symmetric cipher implementations.
* `ccblowfish`: Blowfish symmetric cipher implementations.
* `cccast`: CAST symmetric cipher implementations.
* `ccdes`: DES and 3DES symmetric cipher implementations.
* `ccrc2`: RC2 symmetric cipher implementations.
* `ccrc4`: RC4 symmetric cipher implementations.
* `ccperf`: Performance testing harness.
* `cctest`: Common utilities for creating self tests and XCTest unit tests.
* `ccprime`: Functions for generating large prime numbers. Mostly used in RSA key generation.
* `ccspake`: SPAKE2+ password-based key exchange implementation.
### Module Subdirectories
Each module has the following subdirectories:
* `corecrypto`: headers for this module
* `src`: sources for this module
* `doc`: documentation, references, etc.
* `xcunit`: XCTest based unit tests for this module.
* `crypto_tests`: sources for executable tests for this module
* `test_vectors`: test vectors for this module
* `tools`: sources for random helper tools.
The following subdirectories don't follow the module layout yet:
* `corecrypto_kext`: Supporting files for kernel extension build and fips support.
* `corecrypto_dylib`: Supporting files for userspace shared lib build and fips support.
ARMV6m
------
The ARMV6m target is not on the corecrypto project target list. To compile corecrypto for ARMV6m, use the following command:
`$ xcodebuild -target "corecrypto_static" OTHER_CFLAGS="-Qunused-arguments" -sdk iphoneos.internal -arch armv6m`
Windows
-------
corecrypto compiles under Windows using Visual Studio 2015 and Clang with Microsoft CodeGen. The corecrypto Solution contains four projects:
1. `corecrypto`: This project compiles corecrypto and produces a static library in 32- and 64-bit modes.
2. `corecrypto_test`: This project compiles corecrypto test files and links statically with the corecrypto debug library.
3. `corecrypto_perf`: This project compiles corecrypto performance measurement files and links statically with the corecrypto release library.
4. `corecrypto_wintest`: This project contains simple code that links to corecrypto.lib and compiles as C++ using the Visual C++ compiler. It was created to
make sure the corecrypto library can be linked into C++ software that is compiled with the Microsoft compiler.
Android
------
The corecrypto library, `corecrypto_test`, and `corecrypto_perf` compile under Android. The Android project file is in the `android` subdirectory.
Linux
-----
The corecrypto library, `corecrypto_test`, and `corecrypto_perf` compile under Linux and are built using CMake. See the CMake section for more details.
The Linux build does not use the assembly (ASM) implementations due to differences between the assemblers on Darwin and Linux.
CMake
-----
The corecrypto library, `corecrypto_test`, and `corecrypto_perf` can also be built using CMake on macOS and Linux.
To compile using CMake, run the usual CMake commands:
```
$ cd <srcdir>
$ mkdir build && cd build
$ CC=clang CXX=clang++ cmake ..
$ make
```
where `<srcdir>` is the path to the directory containing the sources.
To install, type `make install` from the build directory (this will require root privileges).
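The CMake build also configures a `corecrypto.pc` pkg-config file and installs it under the library directory's `pkgconfig` folder (see `CMakeLists.txt` above). Assuming the installed `.pc` file carries the appropriate include and link flags, a client program could then be compiled against the static library along these lines (`myprog.c` is just a placeholder name):
```
$ cc myprog.c $(pkg-config --cflags --libs corecrypto) -o myprog
```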
Prototype changes
-----------------
From time to time, corecrypto needs to change the prototypes of functions.
In this case, we use a macro defined as:
`CC_CHANGEFUNCTION_<radar>_<function name>`
and the header will document instructions to migrate from the old to the new function prototype.
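As a purely hypothetical illustration of the pattern (the radar number `12345678`, the function `ccfoo_init`, and both prototypes below are placeholders, not actual corecrypto definitions), such a macro might gate the old and new prototypes in a header roughly like this:
```c
/* Hypothetical sketch only: placeholder names, not actual corecrypto APIs. */
#include <stddef.h>

struct ccfoo_ctx; /* opaque placeholder context */

#if defined(CC_CHANGEFUNCTION_12345678_ccfoo_init)
/* New prototype: callers that have migrated define the macro. */
int ccfoo_init(struct ccfoo_ctx *ctx, size_t n);
#else
/* Old prototype, kept so existing callers keep building until they migrate. */
int ccfoo_init(struct ccfoo_ctx *ctx);
#endif
```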

View File

@@ -0,0 +1,919 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 50;
objects = {
/* Begin PBXAggregateTarget section */
2CD5E9C120D85B370097F130 /* AccelerateCrypto */ = {
isa = PBXAggregateTarget;
buildConfigurationList = 2CD5E9C420D85B370097F130 /* Build configuration list for PBXAggregateTarget "AccelerateCrypto" */;
buildPhases = (
);
dependencies = (
2C88439021B74BE100C49BD9 /* PBXTargetDependency */,
2C6CED2E20E195E90045D491 /* PBXTargetDependency */,
);
name = AccelerateCrypto;
productName = AccelerateCrypto;
};
/* End PBXAggregateTarget section */
/* Begin PBXBuildFile section */
2C6CED1120E1956A0045D491 /* sha512_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */; };
2C6CED1220E195710045D491 /* sha512_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */; };
2C6CED1320E1957F0045D491 /* sha512_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */; };
2C6CED1420E1957F0045D491 /* sha512_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */; };
2C6CED1520E1957F0045D491 /* sha512_compress_ssse3.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */; };
2C6CED1620E1957F0045D491 /* sha512_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED620DD5D2C00840ABB /* sha512_compress.c */; };
2C6CED1720E1957F0045D491 /* sha512_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED120DD5D2C00840ABB /* sha512_K.c */; };
2C6CED1820E195850045D491 /* sha256_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */; };
2C6CED1920E195890045D491 /* sha256_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */; };
2C6CED1A20E1958D0045D491 /* sha256_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */; };
2C6CED1B20E1958D0045D491 /* sha256_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */; };
2C6CED1C20E1958D0045D491 /* sha256_compress_ssse3_32.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */; };
2C6CED1D20E1958D0045D491 /* sha256_compress_ssse3_64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */; };
2C6CED1E20E1958D0045D491 /* sha256_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBB20DD5D1800840ABB /* sha256_compress.c */; };
2C6CED1F20E1958D0045D491 /* sha256_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ECA20DD5D1900840ABB /* sha256_K.c */; };
2C6CED2020E195930045D491 /* sha1_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */; };
2C6CED2120E195970045D491 /* sha1_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */; };
2C6CED2220E1959B0045D491 /* sha1_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */; };
2C6CED2320E1959B0045D491 /* sha1_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */; };
2C6CED2420E1959B0045D491 /* sha1_compress_sse.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */; };
2C6CED2520E1959B0045D491 /* sha1_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA820DD5D0100840ABB /* sha1_compress.c */; };
2C6CED2620E195A80045D491 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE120DD5D4600840ABB /* decrypt.s */; };
2C6CED2720E195A80045D491 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE220DD5D4600840ABB /* encrypt.s */; };
2C6CED2820E195A80045D491 /* vpaes-armv7.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE020DD5D4600840ABB /* vpaes-armv7.s */; };
2C6CED2920E195B60045D491 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDC20DD5D4600840ABB /* decrypt.s */; };
2C6CED2A20E195B60045D491 /* decrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */; };
2C6CED2B20E195B60045D491 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDE20DD5D4600840ABB /* encrypt.s */; };
2C6CED2C20E195B60045D491 /* encrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */; };
2C6CED2F20E302B40045D491 /* AccelerateCrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */; };
2C88436C21B74AD500C49BD9 /* sha1_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA820DD5D0100840ABB /* sha1_compress.c */; };
2C88436D21B74AD500C49BD9 /* sha256_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */; };
2C88436E21B74AD500C49BD9 /* sha256_compress_ssse3_64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */; };
2C88436F21B74AD500C49BD9 /* sha1_compress_sse.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */; };
2C88437021B74AD500C49BD9 /* encrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */; };
2C88437121B74AD500C49BD9 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE220DD5D4600840ABB /* encrypt.s */; };
2C88437221B74AD500C49BD9 /* encrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDE20DD5D4600840ABB /* encrypt.s */; };
2C88437321B74AD500C49BD9 /* sha1_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */; };
2C88437421B74AD500C49BD9 /* sha256_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EBB20DD5D1800840ABB /* sha256_compress.c */; };
2C88437521B74AD500C49BD9 /* sha512_compress_ssse3.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */; };
2C88437621B74AD500C49BD9 /* sha512_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */; };
2C88437721B74AD500C49BD9 /* sha512_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */; };
2C88437821B74AD500C49BD9 /* sha256_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ECA20DD5D1900840ABB /* sha256_K.c */; };
2C88437921B74AD500C49BD9 /* sha1_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */; };
2C88437A21B74AD500C49BD9 /* sha512_compress.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED620DD5D2C00840ABB /* sha512_compress.c */; };
2C88437B21B74AD500C49BD9 /* sha256_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */; };
2C88437C21B74AD500C49BD9 /* sha256_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */; };
2C88437D21B74AD500C49BD9 /* vpaes-armv7.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE020DD5D4600840ABB /* vpaes-armv7.s */; };
2C88437E21B74AD500C49BD9 /* sha256_compress_ssse3_32.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */; };
2C88437F21B74AD500C49BD9 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EE120DD5D4600840ABB /* decrypt.s */; };
2C88438021B74AD500C49BD9 /* sha1_compress_arm64.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */; };
2C88438121B74AD500C49BD9 /* decrypt.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDC20DD5D4600840ABB /* decrypt.s */; };
2C88438221B74AD500C49BD9 /* sha512_K.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED120DD5D2C00840ABB /* sha512_K.c */; };
2C88438321B74AD500C49BD9 /* sha512_compress_avx2.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */; };
2C88438421B74AD500C49BD9 /* sha256_compress_avx1.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */; };
2C88438521B74AD500C49BD9 /* sha1_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */; };
2C88438621B74AD500C49BD9 /* decrypt_ecb.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */; };
2C88438721B74AD500C49BD9 /* sha512_compress_armv7neon.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */; };
2C8843A921B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */; };
2C8843AA21B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */; };
2C8843AB21B8AA8200C49BD9 /* Context.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C8843A521B8AA8200C49BD9 /* Context.h */; };
2C8843AC21B8AA8200C49BD9 /* Context.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C8843A521B8AA8200C49BD9 /* Context.h */; };
2C8843AD21B8AA8200C49BD9 /* crypt_aesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */; };
2C8843AE21B8AA8200C49BD9 /* crypt_aesni.s in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */; };
2C8843AF21B8AA8200C49BD9 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A721B8AA8200C49BD9 /* aes.c */; };
2C8843B021B8AA8200C49BD9 /* aes.c in Sources */ = {isa = PBXBuildFile; fileRef = 2C8843A721B8AA8200C49BD9 /* aes.c */; };
2C93F58321BAF750009239B3 /* AccelerateCrypto.h in Headers */ = {isa = PBXBuildFile; fileRef = 2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
2C6CED2D20E195E90045D491 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 2CC8863B20D859F200D17D95 /* Project object */;
proxyType = 1;
remoteGlobalIDString = 2C6CED0720E195360045D491;
remoteInfo = libAccelerateCrypto;
};
2C88438F21B74BE100C49BD9 /* PBXContainerItemProxy */ = {
isa = PBXContainerItemProxy;
containerPortal = 2CC8863B20D859F200D17D95 /* Project object */;
proxyType = 1;
remoteGlobalIDString = 2C88436A21B74AD500C49BD9;
remoteInfo = libAccelerateCrypto_kernel;
};
/* End PBXContainerItemProxy section */
/* Begin PBXFileReference section */
2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = AccelerateCrypto.h; path = Header/AccelerateCrypto.h; sourceTree = SOURCE_ROOT; };
2C447EA020DD5C1300840ABB /* config.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = config.h; path = Include/config.h; sourceTree = SOURCE_ROOT; };
2C447EA120DD5C1300840ABB /* arm64_isa_compatibility.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = arm64_isa_compatibility.h; path = Include/arm64_isa_compatibility.h; sourceTree = SOURCE_ROOT; };
2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_avx1.s; path = Source/sha1/intel/sha1_compress_avx1.s; sourceTree = SOURCE_ROOT; };
2C447EA820DD5D0100840ABB /* sha1_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha1_compress.c; path = Source/sha1/intel/sha1_compress.c; sourceTree = SOURCE_ROOT; };
2C447EAB20DD5D0100840ABB /* sha1_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha1_compress.c; sourceTree = "<group>"; };
2C447EAC20DD5D0100840ABB /* sha1_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_avx1.s; sourceTree = "<group>"; };
2C447EAD20DD5D0100840ABB /* sha1_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_avx2.s; sourceTree = "<group>"; };
2C447EAE20DD5D0100840ABB /* sha1_compress_sse.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_sse.s; sourceTree = "<group>"; };
2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_arm64.s; sourceTree = "<group>"; };
2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_sse.s; path = Source/sha1/intel/sha1_compress_sse.s; sourceTree = SOURCE_ROOT; };
2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha1_compress_avx2.s; path = Source/sha1/intel/sha1_compress_avx2.s; sourceTree = SOURCE_ROOT; };
2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha1_compress_armv7neon.s; sourceTree = "<group>"; };
2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_avx1.s; path = Source/sha256/intel/sha256_compress_avx1.s; sourceTree = SOURCE_ROOT; };
2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_ssse3_64.s; path = Source/sha256/intel/sha256_compress_ssse3_64.s; sourceTree = SOURCE_ROOT; };
2C447EBB20DD5D1800840ABB /* sha256_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha256_compress.c; path = Source/sha256/intel/sha256_compress.c; sourceTree = SOURCE_ROOT; };
2C447EBD20DD5D1800840ABB /* sha256_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha256_compress.c; sourceTree = "<group>"; };
2C447EBE20DD5D1800840ABB /* sha256_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_avx1.s; sourceTree = "<group>"; };
2C447EBF20DD5D1800840ABB /* sha256_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_avx2.s; sourceTree = "<group>"; };
2C447EC020DD5D1800840ABB /* sha256_compress_ssse3_32.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_ssse3_32.s; sourceTree = "<group>"; };
2C447EC120DD5D1800840ABB /* sha256_compress_ssse3_64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_ssse3_64.s; sourceTree = "<group>"; };
2C447EC220DD5D1800840ABB /* sha256_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha256_K.c; sourceTree = "<group>"; };
2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_arm64.s; sourceTree = "<group>"; };
2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha256_compress_armv7neon.s; sourceTree = "<group>"; };
2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_avx2.s; path = Source/sha256/intel/sha256_compress_avx2.s; sourceTree = SOURCE_ROOT; };
2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha256_compress_ssse3_32.s; path = Source/sha256/intel/sha256_compress_ssse3_32.s; sourceTree = SOURCE_ROOT; };
2C447ECA20DD5D1900840ABB /* sha256_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha256_K.c; path = Source/sha256/intel/sha256_K.c; sourceTree = SOURCE_ROOT; };
2C447ECC20DD5D2C00840ABB /* sha512_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = sha512_compress.c; sourceTree = "<group>"; };
2C447ECD20DD5D2C00840ABB /* sha512_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_avx1.s; sourceTree = "<group>"; };
2C447ECE20DD5D2C00840ABB /* sha512_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_avx2.s; sourceTree = "<group>"; };
2C447ECF20DD5D2C00840ABB /* sha512_compress_ssse3.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_ssse3.s; sourceTree = "<group>"; };
2C447ED120DD5D2C00840ABB /* sha512_K.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha512_K.c; path = Source/sha512/sha512_K.c; sourceTree = SOURCE_ROOT; };
2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_ssse3.s; path = Source/sha512/intel/sha512_compress_ssse3.s; sourceTree = SOURCE_ROOT; };
2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_avx2.s; path = Source/sha512/intel/sha512_compress_avx2.s; sourceTree = SOURCE_ROOT; };
2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = sha512_compress_avx1.s; path = Source/sha512/intel/sha512_compress_avx1.s; sourceTree = SOURCE_ROOT; };
2C447ED620DD5D2C00840ABB /* sha512_compress.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = sha512_compress.c; path = Source/sha512/intel/sha512_compress.c; sourceTree = SOURCE_ROOT; };
2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_arm64.s; sourceTree = "<group>"; };
2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = sha512_compress_armv7neon.s; sourceTree = "<group>"; };
2C447EDC20DD5D4600840ABB /* decrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = decrypt.s; sourceTree = "<group>"; };
2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = decrypt_ecb.s; sourceTree = "<group>"; };
2C447EDE20DD5D4600840ABB /* encrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = encrypt.s; sourceTree = "<group>"; };
2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = encrypt_ecb.s; sourceTree = "<group>"; };
2C447EE020DD5D4600840ABB /* vpaes-armv7.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = "vpaes-armv7.s"; path = "Source/aes/arm/vpaes-armv7.s"; sourceTree = SOURCE_ROOT; };
2C447EE120DD5D4600840ABB /* decrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = decrypt.s; path = Source/aes/arm/decrypt.s; sourceTree = SOURCE_ROOT; };
2C447EE220DD5D4600840ABB /* encrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = encrypt.s; path = Source/aes/arm/encrypt.s; sourceTree = SOURCE_ROOT; };
2C447EE420DD5D4700840ABB /* EncryptDecrypt.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = EncryptDecrypt.s; path = Source/aes/arm/EncryptDecrypt.s; sourceTree = SOURCE_ROOT; };
2C6CED0820E195360045D491 /* libAccelerateCrypto.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libAccelerateCrypto.a; sourceTree = BUILT_PRODUCTS_DIR; };
2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libAccelerateCrypto_kernel.a; sourceTree = BUILT_PRODUCTS_DIR; };
2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = crypt_nonaesni.s; path = Source/aes/intel/crypt_nonaesni.s; sourceTree = SOURCE_ROOT; };
2C8843A521B8AA8200C49BD9 /* Context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Context.h; path = Source/aes/intel/Context.h; sourceTree = SOURCE_ROOT; };
2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = crypt_aesni.s; path = Source/aes/intel/crypt_aesni.s; sourceTree = SOURCE_ROOT; };
2C8843A721B8AA8200C49BD9 /* aes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = aes.c; path = Source/aes/intel/aes.c; sourceTree = SOURCE_ROOT; };
2C8843A821B8AA8200C49BD9 /* Data.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Data.s; path = Source/aes/intel/Data.s; sourceTree = SOURCE_ROOT; };
2C8843B321B8AA9700C49BD9 /* EncryptDecrypt.s */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.asm; name = EncryptDecrypt.s; path = Source/aes/intel/EncryptDecrypt.s; sourceTree = SOURCE_ROOT; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
2C6CED0520E195360045D491 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
2C88438821B74AD500C49BD9 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
2C447E9D20DD5B2600840ABB /* Header */ = {
isa = PBXGroup;
children = (
2C447E9E20DD5BD600840ABB /* AccelerateCrypto.h */,
);
path = Header;
sourceTree = "<group>";
};
2C447E9F20DD5BF300840ABB /* Include */ = {
isa = PBXGroup;
children = (
2C447EA120DD5C1300840ABB /* arm64_isa_compatibility.h */,
2C447EA020DD5C1300840ABB /* config.h */,
);
path = Include;
sourceTree = "<group>";
};
2C447EA220DD5C2400840ABB /* Source */ = {
isa = PBXGroup;
children = (
2C447EA620DD5C5F00840ABB /* sha512 */,
2C447EA520DD5C5600840ABB /* sha256 */,
2C447EA420DD5C4F00840ABB /* sha1 */,
2C447EA320DD5C4400840ABB /* aes */,
);
path = Source;
sourceTree = "<group>";
};
2C447EA320DD5C4400840ABB /* aes */ = {
isa = PBXGroup;
children = (
2C8843A321B8AA4900C49BD9 /* intel */,
2C447EE320DD5D4600840ABB /* arm */,
2C447EDB20DD5D4600840ABB /* arm64 */,
);
path = aes;
sourceTree = "<group>";
};
2C447EA420DD5C4F00840ABB /* sha1 */ = {
isa = PBXGroup;
children = (
2C447EB320DD5D0100840ABB /* arm */,
2C447EAF20DD5D0100840ABB /* arm64 */,
2C447EAA20DD5D0100840ABB /* intel */,
2C447EA720DD5D0100840ABB /* sha1_compress_avx1.s */,
2C447EB220DD5D0100840ABB /* sha1_compress_avx2.s */,
2C447EB120DD5D0100840ABB /* sha1_compress_sse.s */,
2C447EA820DD5D0100840ABB /* sha1_compress.c */,
);
path = sha1;
sourceTree = "<group>";
};
2C447EA520DD5C5600840ABB /* sha256 */ = {
isa = PBXGroup;
children = (
2C447EC620DD5D1900840ABB /* arm */,
2C447EC320DD5D1800840ABB /* arm64 */,
2C447EBC20DD5D1800840ABB /* intel */,
2C447EB920DD5D1800840ABB /* sha256_compress_avx1.s */,
2C447EC820DD5D1900840ABB /* sha256_compress_avx2.s */,
2C447EC920DD5D1900840ABB /* sha256_compress_ssse3_32.s */,
2C447EBA20DD5D1800840ABB /* sha256_compress_ssse3_64.s */,
2C447EBB20DD5D1800840ABB /* sha256_compress.c */,
2C447ECA20DD5D1900840ABB /* sha256_K.c */,
);
path = sha256;
sourceTree = "<group>";
};
2C447EA620DD5C5F00840ABB /* sha512 */ = {
isa = PBXGroup;
children = (
2C447ED920DD5D2C00840ABB /* arm */,
2C447ED720DD5D2C00840ABB /* arm64 */,
2C447ECB20DD5D2C00840ABB /* intel */,
2C447ED420DD5D2C00840ABB /* sha512_compress_avx1.s */,
2C447ED320DD5D2C00840ABB /* sha512_compress_avx2.s */,
2C447ED220DD5D2C00840ABB /* sha512_compress_ssse3.s */,
2C447ED620DD5D2C00840ABB /* sha512_compress.c */,
2C447ED120DD5D2C00840ABB /* sha512_K.c */,
);
path = sha512;
sourceTree = "<group>";
};
2C447EAA20DD5D0100840ABB /* intel */ = {
isa = PBXGroup;
children = (
2C447EAB20DD5D0100840ABB /* sha1_compress.c */,
2C447EAC20DD5D0100840ABB /* sha1_compress_avx1.s */,
2C447EAD20DD5D0100840ABB /* sha1_compress_avx2.s */,
2C447EAE20DD5D0100840ABB /* sha1_compress_sse.s */,
);
name = intel;
path = Source/sha1/intel;
sourceTree = SOURCE_ROOT;
};
2C447EAF20DD5D0100840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447EB020DD5D0100840ABB /* sha1_compress_arm64.s */,
);
name = arm64;
path = Source/sha1/arm64;
sourceTree = SOURCE_ROOT;
};
2C447EB320DD5D0100840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EB420DD5D0100840ABB /* sha1_compress_armv7neon.s */,
);
name = arm;
path = Source/sha1/arm;
sourceTree = SOURCE_ROOT;
};
2C447EBC20DD5D1800840ABB /* intel */ = {
isa = PBXGroup;
children = (
2C447EBD20DD5D1800840ABB /* sha256_compress.c */,
2C447EBE20DD5D1800840ABB /* sha256_compress_avx1.s */,
2C447EBF20DD5D1800840ABB /* sha256_compress_avx2.s */,
2C447EC020DD5D1800840ABB /* sha256_compress_ssse3_32.s */,
2C447EC120DD5D1800840ABB /* sha256_compress_ssse3_64.s */,
2C447EC220DD5D1800840ABB /* sha256_K.c */,
);
name = intel;
path = Source/sha256/intel;
sourceTree = SOURCE_ROOT;
};
2C447EC320DD5D1800840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447EC520DD5D1800840ABB /* sha256_compress_arm64.s */,
);
name = arm64;
path = Source/sha256/arm64;
sourceTree = SOURCE_ROOT;
};
2C447EC620DD5D1900840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EC720DD5D1900840ABB /* sha256_compress_armv7neon.s */,
);
name = arm;
path = Source/sha256/arm;
sourceTree = SOURCE_ROOT;
};
2C447ECB20DD5D2C00840ABB /* intel */ = {
isa = PBXGroup;
children = (
2C447ECC20DD5D2C00840ABB /* sha512_compress.c */,
2C447ECD20DD5D2C00840ABB /* sha512_compress_avx1.s */,
2C447ECE20DD5D2C00840ABB /* sha512_compress_avx2.s */,
2C447ECF20DD5D2C00840ABB /* sha512_compress_ssse3.s */,
);
name = intel;
path = Source/sha512/intel;
sourceTree = SOURCE_ROOT;
};
2C447ED720DD5D2C00840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447ED820DD5D2C00840ABB /* sha512_compress_arm64.s */,
);
name = arm64;
path = Source/sha512/arm64;
sourceTree = SOURCE_ROOT;
};
2C447ED920DD5D2C00840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EDA20DD5D2C00840ABB /* sha512_compress_armv7neon.s */,
);
name = arm;
path = Source/sha512/arm;
sourceTree = SOURCE_ROOT;
};
2C447EDB20DD5D4600840ABB /* arm64 */ = {
isa = PBXGroup;
children = (
2C447EDC20DD5D4600840ABB /* decrypt.s */,
2C447EDD20DD5D4600840ABB /* decrypt_ecb.s */,
2C447EDE20DD5D4600840ABB /* encrypt.s */,
2C447EDF20DD5D4600840ABB /* encrypt_ecb.s */,
);
name = arm64;
path = Source/aes/arm64;
sourceTree = SOURCE_ROOT;
};
2C447EE320DD5D4600840ABB /* arm */ = {
isa = PBXGroup;
children = (
2C447EE120DD5D4600840ABB /* decrypt.s */,
2C447EE220DD5D4600840ABB /* encrypt.s */,
2C447EE420DD5D4700840ABB /* EncryptDecrypt.s */,
2C447EE020DD5D4600840ABB /* vpaes-armv7.s */,
);
name = arm;
path = Source/aes/arm;
sourceTree = SOURCE_ROOT;
};
2C447EEA20DD5FA700840ABB /* Products */ = {
isa = PBXGroup;
children = (
2C6CED0820E195360045D491 /* libAccelerateCrypto.a */,
2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */,
);
name = Products;
sourceTree = "<group>";
};
2C8843A321B8AA4900C49BD9 /* intel */ = {
isa = PBXGroup;
children = (
2C8843B321B8AA9700C49BD9 /* EncryptDecrypt.s */,
2C8843A721B8AA8200C49BD9 /* aes.c */,
2C8843A521B8AA8200C49BD9 /* Context.h */,
2C8843A621B8AA8200C49BD9 /* crypt_aesni.s */,
2C8843A421B8AA8200C49BD9 /* crypt_nonaesni.s */,
2C8843A821B8AA8200C49BD9 /* Data.s */,
);
name = intel;
sourceTree = "<group>";
};
2CC8863A20D859F200D17D95 = {
isa = PBXGroup;
children = (
2C447EA220DD5C2400840ABB /* Source */,
2C447E9F20DD5BF300840ABB /* Include */,
2C447E9D20DD5B2600840ABB /* Header */,
2C447EEA20DD5FA700840ABB /* Products */,
);
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXHeadersBuildPhase section */
2C6CED0620E195360045D491 /* Headers */ = {
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
2C8843AB21B8AA8200C49BD9 /* Context.h in Headers */,
2C6CED2F20E302B40045D491 /* AccelerateCrypto.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
2C88438921B74AD500C49BD9 /* Headers */ = {
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
2C93F58321BAF750009239B3 /* AccelerateCrypto.h in Headers */,
2C8843AC21B8AA8200C49BD9 /* Context.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXHeadersBuildPhase section */
/* Begin PBXNativeTarget section */
2C6CED0720E195360045D491 /* libAccelerateCrypto */ = {
isa = PBXNativeTarget;
buildConfigurationList = 2C6CED1020E195360045D491 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto" */;
buildPhases = (
2C6CED0420E195360045D491 /* Sources */,
2C6CED0520E195360045D491 /* Frameworks */,
2C6CED0620E195360045D491 /* Headers */,
);
buildRules = (
);
dependencies = (
);
name = libAccelerateCrypto;
productName = libAccelerateCrypto;
productReference = 2C6CED0820E195360045D491 /* libAccelerateCrypto.a */;
productType = "com.apple.product-type.library.static";
};
2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */ = {
isa = PBXNativeTarget;
buildConfigurationList = 2C88438B21B74AD500C49BD9 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto_kernel" */;
buildPhases = (
2C88436B21B74AD500C49BD9 /* Sources */,
2C88438821B74AD500C49BD9 /* Frameworks */,
2C88438921B74AD500C49BD9 /* Headers */,
);
buildRules = (
);
dependencies = (
);
name = libAccelerateCrypto_kernel;
productName = libAccelerateCrypto;
productReference = 2C88438E21B74AD500C49BD9 /* libAccelerateCrypto_kernel.a */;
productType = "com.apple.product-type.library.static";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
2CC8863B20D859F200D17D95 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 1000;
TargetAttributes = {
2C6CED0720E195360045D491 = {
CreatedOnToolsVersion = 10.0;
};
2CD5E9C120D85B370097F130 = {
CreatedOnToolsVersion = 10.0;
};
};
};
buildConfigurationList = 2CC8863E20D859F200D17D95 /* Build configuration list for PBXProject "AccelerateCrypto" */;
compatibilityVersion = "Xcode 9.3";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = 2CC8863A20D859F200D17D95;
productRefGroup = 2C447EEA20DD5FA700840ABB /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
2CD5E9C120D85B370097F130 /* AccelerateCrypto */,
2C6CED0720E195360045D491 /* libAccelerateCrypto */,
2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */,
);
};
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
2C6CED0420E195360045D491 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
2C6CED2520E1959B0045D491 /* sha1_compress.c in Sources */,
2C6CED1B20E1958D0045D491 /* sha256_compress_avx2.s in Sources */,
2C6CED1D20E1958D0045D491 /* sha256_compress_ssse3_64.s in Sources */,
2C6CED2420E1959B0045D491 /* sha1_compress_sse.s in Sources */,
2C6CED2C20E195B60045D491 /* encrypt_ecb.s in Sources */,
2C6CED2720E195A80045D491 /* encrypt.s in Sources */,
2C6CED2B20E195B60045D491 /* encrypt.s in Sources */,
2C6CED2220E1959B0045D491 /* sha1_compress_avx1.s in Sources */,
2C6CED1E20E1958D0045D491 /* sha256_compress.c in Sources */,
2C6CED1520E1957F0045D491 /* sha512_compress_ssse3.s in Sources */,
2C6CED1220E195710045D491 /* sha512_compress_arm64.s in Sources */,
2C8843AD21B8AA8200C49BD9 /* crypt_aesni.s in Sources */,
2C6CED1320E1957F0045D491 /* sha512_compress_avx1.s in Sources */,
2C6CED1F20E1958D0045D491 /* sha256_K.c in Sources */,
2C6CED2320E1959B0045D491 /* sha1_compress_avx2.s in Sources */,
2C6CED1620E1957F0045D491 /* sha512_compress.c in Sources */,
2C6CED1920E195890045D491 /* sha256_compress_arm64.s in Sources */,
2C6CED1820E195850045D491 /* sha256_compress_armv7neon.s in Sources */,
2C6CED2820E195A80045D491 /* vpaes-armv7.s in Sources */,
2C6CED1C20E1958D0045D491 /* sha256_compress_ssse3_32.s in Sources */,
2C6CED2620E195A80045D491 /* decrypt.s in Sources */,
2C6CED2120E195970045D491 /* sha1_compress_arm64.s in Sources */,
2C6CED2920E195B60045D491 /* decrypt.s in Sources */,
2C6CED1720E1957F0045D491 /* sha512_K.c in Sources */,
2C6CED1420E1957F0045D491 /* sha512_compress_avx2.s in Sources */,
2C6CED1A20E1958D0045D491 /* sha256_compress_avx1.s in Sources */,
2C8843A921B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */,
2C6CED2020E195930045D491 /* sha1_compress_armv7neon.s in Sources */,
2C6CED2A20E195B60045D491 /* decrypt_ecb.s in Sources */,
2C8843AF21B8AA8200C49BD9 /* aes.c in Sources */,
2C6CED1120E1956A0045D491 /* sha512_compress_armv7neon.s in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
2C88436B21B74AD500C49BD9 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
2C88436C21B74AD500C49BD9 /* sha1_compress.c in Sources */,
2C88436D21B74AD500C49BD9 /* sha256_compress_avx2.s in Sources */,
2C88436E21B74AD500C49BD9 /* sha256_compress_ssse3_64.s in Sources */,
2C88436F21B74AD500C49BD9 /* sha1_compress_sse.s in Sources */,
2C88437021B74AD500C49BD9 /* encrypt_ecb.s in Sources */,
2C88437121B74AD500C49BD9 /* encrypt.s in Sources */,
2C88437221B74AD500C49BD9 /* encrypt.s in Sources */,
2C88437321B74AD500C49BD9 /* sha1_compress_avx1.s in Sources */,
2C88437421B74AD500C49BD9 /* sha256_compress.c in Sources */,
2C88437521B74AD500C49BD9 /* sha512_compress_ssse3.s in Sources */,
2C88437621B74AD500C49BD9 /* sha512_compress_arm64.s in Sources */,
2C8843AE21B8AA8200C49BD9 /* crypt_aesni.s in Sources */,
2C88437721B74AD500C49BD9 /* sha512_compress_avx1.s in Sources */,
2C88437821B74AD500C49BD9 /* sha256_K.c in Sources */,
2C88437921B74AD500C49BD9 /* sha1_compress_avx2.s in Sources */,
2C88437A21B74AD500C49BD9 /* sha512_compress.c in Sources */,
2C88437B21B74AD500C49BD9 /* sha256_compress_arm64.s in Sources */,
2C88437C21B74AD500C49BD9 /* sha256_compress_armv7neon.s in Sources */,
2C88437D21B74AD500C49BD9 /* vpaes-armv7.s in Sources */,
2C88437E21B74AD500C49BD9 /* sha256_compress_ssse3_32.s in Sources */,
2C88437F21B74AD500C49BD9 /* decrypt.s in Sources */,
2C88438021B74AD500C49BD9 /* sha1_compress_arm64.s in Sources */,
2C88438121B74AD500C49BD9 /* decrypt.s in Sources */,
2C88438221B74AD500C49BD9 /* sha512_K.c in Sources */,
2C88438321B74AD500C49BD9 /* sha512_compress_avx2.s in Sources */,
2C88438421B74AD500C49BD9 /* sha256_compress_avx1.s in Sources */,
2C8843AA21B8AA8200C49BD9 /* crypt_nonaesni.s in Sources */,
2C88438521B74AD500C49BD9 /* sha1_compress_armv7neon.s in Sources */,
2C88438621B74AD500C49BD9 /* decrypt_ecb.s in Sources */,
2C8843B021B8AA8200C49BD9 /* aes.c in Sources */,
2C88438721B74AD500C49BD9 /* sha512_compress_armv7neon.s in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin PBXTargetDependency section */
2C6CED2E20E195E90045D491 /* PBXTargetDependency */ = {
isa = PBXTargetDependency;
target = 2C6CED0720E195360045D491 /* libAccelerateCrypto */;
targetProxy = 2C6CED2D20E195E90045D491 /* PBXContainerItemProxy */;
};
2C88439021B74BE100C49BD9 /* PBXTargetDependency */ = {
isa = PBXTargetDependency;
target = 2C88436A21B74AD500C49BD9 /* libAccelerateCrypto_kernel */;
targetProxy = 2C88438F21B74BE100C49BD9 /* PBXContainerItemProxy */;
};
/* End PBXTargetDependency section */
/* Begin XCBuildConfiguration section */
2C6CED0E20E195360045D491 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = NO;
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/include;
SDKROOT = macosx.internal;
SKIP_INSTALL = YES;
};
name = Debug;
};
2C6CED0F20E195360045D491 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = NO;
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/include;
SDKROOT = macosx.internal;
SKIP_INSTALL = YES;
};
name = Release;
};
2C88438C21B74AD500C49BD9 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = NO;
OTHER_CFLAGS = "-DBUILDKERNEL=1";
"OTHER_CFLAGS[arch=*]" = "-DBUILDKERNEL=1";
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/standalone/firmware/include;
SDKROOT = macosx.internal;
SYSTEM_HEADER_SEARCH_PATHS = "$(SDKROOT)/System/Library/Frameworks/Kernel.framework/Headers";
};
name = Debug;
};
2C88438D21B74AD500C49BD9 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "-";
CODE_SIGN_STYLE = Automatic;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
EXECUTABLE_PREFIX = "";
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
INSTALL_PATH = "";
MACOSX_DEPLOYMENT_TARGET = 10.14;
MTL_ENABLE_DEBUG_INFO = NO;
OTHER_CFLAGS = "-DBUILDKERNEL=1";
PRODUCT_NAME = "$(TARGET_NAME)";
PUBLIC_HEADERS_FOLDER_PATH = /usr/local/standalone/firmware/include;
SDKROOT = macosx.internal;
SYSTEM_HEADER_SEARCH_PATHS = "$(SDKROOT)/System/Library/Frameworks/Kernel.framework/Headers";
};
name = Release;
};
2CC8863F20D859F200D17D95 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
SUPPORTED_PLATFORMS = "macosx iphoneos tvos watchos";
};
name = Debug;
};
2CC8864020D859F200D17D95 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
SUPPORTED_PLATFORMS = "macosx iphoneos tvos watchos";
};
name = Release;
};
2CD5E9C220D85B370097F130 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Debug;
};
2CD5E9C320D85B370097F130 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
PRODUCT_NAME = "$(TARGET_NAME)";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
2C6CED1020E195360045D491 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2C6CED0E20E195360045D491 /* Debug */,
2C6CED0F20E195360045D491 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
2C88438B21B74AD500C49BD9 /* Build configuration list for PBXNativeTarget "libAccelerateCrypto_kernel" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2C88438C21B74AD500C49BD9 /* Debug */,
2C88438D21B74AD500C49BD9 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
2CC8863E20D859F200D17D95 /* Build configuration list for PBXProject "AccelerateCrypto" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2CC8863F20D859F200D17D95 /* Debug */,
2CC8864020D859F200D17D95 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
2CD5E9C420D85B370097F130 /* Build configuration list for PBXAggregateTarget "AccelerateCrypto" */ = {
isa = XCConfigurationList;
buildConfigurations = (
2CD5E9C220D85B370097F130 /* Debug */,
2CD5E9C320D85B370097F130 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 2CC8863B20D859F200D17D95 /* Project object */;
}

View File

@ -0,0 +1,121 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef AccelerateCrypto_h
#define AccelerateCrypto_h
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/*! @abstract SHA-1 160-bit digest update for numBlocks chunks of 64-byte (512-bit) data.
*
* @discussion
* This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (CRYPTO), and armv7 (NEON).
*
* @param state (input/output) Array of 5 uint32_t elements.
*
* @param numBlocks (input) Number of 64-byte data chunks.
*
* @param data (input) Array of size numBlocks*64 input bytes.
*/
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t numBlocks, const void *data);
/*! @abstract SHA-256 256-bit digest update for numBlocks chunks of 64-byte (512-bit) data.
*
* @discussion
* This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (CRYPTO), and armv7 (NEON).
*
* @param state (input/output) Array of 8 uint32_t elements.
*
* @param numBlocks (input) Number of 64-byte data chunks.
*
* @param data (input) Array of size numBlocks*64 input bytes.
*/
void AccelerateCrypto_SHA256_compress(uint32_t *state, size_t numBlocks, const void *data);
#if defined(__arm64__)
void AccelerateCrypto_SHA256_compress_arm64neon(uint32_t *state, size_t numBlocks, const void *data);
#endif
/*! @abstract SHA-512 512-bit digest update for numBlocks chunks of 128-byte (1,024-bit) data.
*
* @discussion
* This routine is optimized for x86_64 (SSE3,AVX1,AVX2), arm64 (NEON), and armv7 (NEON).
*
* @param state (input/output) Array of 8 uint64_t elements.
*
* @param numBlocks (input) Number of 128-byte data chunks.
*
* @param data (input) Array of size numBlocks*128 input bytes.
*/
void AccelerateCrypto_SHA512_compress(uint64_t *state, size_t numBlocks, const void *data);
#if defined(__arm64__)
void AccelerateCrypto_SHA512_compress_hwassist(uint64_t *state, size_t numBlocks, const void *data);
#endif
/* AES expanded key context */
#define KS_LENGTH 60
typedef struct
{ uint32_t ks[KS_LENGTH]; // maximum expanded key length = (14+1)*16 = 240 bytes = 60 uint32 words
uint32_t rn; // rn = 16*(10,12,14) for AES-128,192,256
} AccelerateCrypto_AES_ctx;
/*! @abstract AES function encrypts a 16-byte input buffer to a 16-byte output buffer according to
* a given input expanded key context.
*
* @discussion
* This routine is optimized for x86_64 (aesni), arm64 (CRYPTO), and armv7 (NEON).
*
* @param in (input) Array of 16-byte message.
*
* @param out (output) Array of 16-byte encrypted message.
*
* @param key (input) Expanded key context for encryption.
*
* @return 0 on success; otherwise a nonzero number indicating failure in the encrypt function.
*
*/
int AccelerateCrypto_AES_encrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
/*! @abstract AES function decrypts a 16-byte input buffer to a 16-byte output buffer according to
* a given input expanded key context.
*
* @discussion
* This routine is optimized for x86_64 (aesni), arm64 (CRYPTO), and armv7 (NEON).
*
* @param in (input) Array of 16-byte encrypted message.
*
* @param out (output) Array of 16-byte decrypted message.
*
* @param key (input) Expanded key context for decryption.
*
* @return 0 on success; otherwise a nonzero number indicating failure in the decrypt function.
*
*/
int AccelerateCrypto_AES_decrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
#if defined(__arm64__)
int AccelerateCrypto_ecb_AES_encrypt(const AccelerateCrypto_AES_ctx *key, uint32_t nblocks, const void *in, void *out);
int AccelerateCrypto_ecb_AES_decrypt(const AccelerateCrypto_AES_ctx *key, uint32_t nblocks, const void *in, void *out);
#endif
#ifdef __cplusplus
}
#endif // __cplusplus
#endif /* AccelerateCrypto_h */
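A minimal usage sketch for the interface above (not part of the header; it assumes the header is installed as AccelerateCrypto.h, uses the standard FIPS 180-4 SHA-256 initial hash values, and expects the 64-byte block to be already padded by the caller):
#include <stdint.h>
#include <string.h>
#include "AccelerateCrypto.h"
static void example_sha256_one_block(const uint8_t block[64], uint32_t state[8])
{
    // Standard SHA-256 initial hash values (FIPS 180-4). Message padding is
    // the caller's responsibility; the compress routine only runs the rounds.
    static const uint32_t H0[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    memcpy(state, H0, sizeof(H0));
    AccelerateCrypto_SHA256_compress(state, 1, block);   // one 64-byte chunk
}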

View File

@ -0,0 +1,167 @@
/* Copyright (c) (2013,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
// #include <Availability.h>
#include <sys/cdefs.h>
#if defined(__clang__) && ((defined(__apple_build_version__) && __apple_build_version__ > 5010000))
#define __USES_V_CRYPTO_INTRINSICS 1
#else
#define __USES_V_CRYPTO_INTRINSICS 0
#endif
// AES INSTRUCTIONS
// aese.16b v0, v1
// aesd.16b v0, v1
// aesmc.16b v0, v1
// aesimc.16b v0, v1
// SHA1 INTRINSICS
// sha1su0.4s v0, v1, v2
// sha1su1.4s v0, v1
// sha1c.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1m.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1p.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1h.4s v0, v1 // or s0, s1
// SHA256 INTRINSICS
// sha256su0.4s v0, v1
// sha256su1.4s v0, v1, v2
// sha256h.4s v0, v1, v2 // or q0, q1, v2.4s
// sha256h2.4s v0, v1, v2 // or q0, q1, v2.4s
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro AESE
aese.16b v$0, v$1
.endm
.macro AESD
aesd.16b v$0, v$1
.endm
.macro AESMC
aesmc.16b v$0, v$1
.endm
.macro AESIMC
aesimc.16b v$0, v$1
.endm
#else
.macro AESE
aese q$0, q$1
.endm
.macro AESD
aesd q$0, q$1
.endm
.macro AESMC
aesmc q$0, q$1
.endm
.macro AESIMC
aesimc q$0, q$1
.endm
#endif
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro SHA1SU0
sha1su0 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA1SU1
sha1su1 v$0.4s, v$1.4s
.endm
.macro SHA1C
sha1c q$0, s$1, v$2.4s
.endm
.macro SHA1M
sha1m q$0, s$1, v$2.4s
.endm
.macro SHA1P
sha1p q$0, s$1, v$2.4s
.endm
.macro SHA1H
sha1h s$0, s$1
.endm
.macro SHA256SU0
sha256su0 v$0.4s, v$1.4s
.endm
.macro SHA256SU1
sha256su1 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA256H
sha256h q$0, q$1, v$2.4s
.endm
.macro SHA256H2
sha256h2 q$0, q$1, v$2.4s
.endm
#else
.macro SHA1SU0
sha1su0 q$0, q$1, q$2
.endm
.macro SHA1SU1
sha1su1 q$0, q$1
.endm
.macro SHA1C
sha1c q$0, q$1, q$2
.endm
.macro SHA1M
sha1m q$0, q$1, q$2
.endm
.macro SHA1P
sha1p q$0, q$1, q$2
.endm
.macro SHA1H
sha1h q$0, q$1
.endm
.macro SHA256SU0
sha256su0 q$0, q$1
.endm
.macro SHA256SU1
sha256su1 q$0, q$1, q$2
.endm
.macro SHA256H
sha256h q$0, q$1, q$2
.endm
.macro SHA256H2
sha256h2 q$0, q$1, q$2
.endm
#endif

View File

@ -0,0 +1,66 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#if (defined(__x86_64__) || defined(__i386__))
#if BUILDKERNEL
#include <i386/cpuid.h>
#define HAS_AESNI() ((cpuid_features() & CPUID_FEATURE_AES) != 0)
#define HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
#define HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
#define HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
#define HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
#elif (defined(__APPLE__) && defined(__MACH__) && (__has_include(<System/i386/cpu_capabilities.h>) || __has_include(<System/arm/cpu_capabilities.h>))) // XNU_KERNEL_AVAILABLE
#include <System/i386/cpu_capabilities.h>
extern int _cpu_capabilities;
#define HAS_AESNI() (_cpu_capabilities & kHasAES)
#define HAS_SupplementalSSE3() (_cpu_capabilities & kHasSupplementalSSE3)
#define HAS_AVX1() (_cpu_capabilities & kHasAVX1_0)
#define HAS_AVX2() (_cpu_capabilities & kHasAVX2_0)
#define HAS_AVX512_AND_IN_KERNEL() 0
#else
#if (defined(__AES__))
#define HAS_AESNI() __AES__
#else
#define HAS_AESNI() 0
#endif // defined(__AES__)
#if (defined(__SSSE3__))
#define HAS_SupplementalSSE3() __SSSE3__
#else
#define HAS_SupplementalSSE3() 0
#endif // defined(__SSSE3__)
#if (defined(__AVX__))
#define HAS_AVX1() __AVX__
#else
#define HAS_AVX1() 0
#endif // defined(__AVX__)
#if (defined(__AVX2__))
#define HAS_AVX2() __AVX2__
#else
#define HAS_AVX2() 0
#endif // defined(__AVX2__)
#define HAS_AVX512_AND_IN_KERNEL() 0
#endif
#endif // (defined(__x86_64__) || defined(__i386__))
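As a rough illustration of how these feature macros are typically consumed, a hypothetical dispatcher might look like the sketch below; the sha256_compress_* symbols are placeholders invented for this example, not functions defined by this header:
#include <stddef.h>
#include <stdint.h>
// Placeholder kernels -- shown only to illustrate how HAS_*() gates the choice.
extern void sha256_compress_avx2(uint32_t *state, size_t nblocks, const void *data);
extern void sha256_compress_ssse3(uint32_t *state, size_t nblocks, const void *data);
extern void sha256_compress_portable(uint32_t *state, size_t nblocks, const void *data);
void sha256_compress_dispatch(uint32_t *state, size_t nblocks, const void *data)
{
    if (HAS_AVX2()) {
        sha256_compress_avx2(state, nblocks, data);
    } else if (HAS_SupplementalSSE3()) {
        sha256_compress_ssse3(state, nblocks, data);
    } else {
        sha256_compress_portable(state, nblocks, data);
    }
}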

View File

@ -0,0 +1,12 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#
This is a clone of AccelerateCrypto-2.

View File

@ -0,0 +1,477 @@
# Copyright (c) (2011,2012,2013,2014,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm__)
#define S0 r0
#define S1 r1
#define S2 r2
#define S3 r3
#if Select == 0
#define Name _AccelerateCrypto_AES_encrypt // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _AccelerateCrypto_AES_decrypt // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select
#if defined(__ARM_NEON__) // vpaes uses NEON instructions
.extern _AccelerateCrypto_vpaes_encrypt
.extern _AccelerateCrypto_vpaes_decrypt
#endif
#define ExpandedKey r11
#define ExpandedKeyEnd lr
#define ContextKeyLength 240
#define t r12
.subsections_via_symbols
.text
.syntax unified
.p2align 2
.code 16
.thumb_func Name
.globl Name
Name:
#if defined(__ARM_NEON__) // if neon is available, use cache-attack resilient vector permute AES
#if Select == 0
b _AccelerateCrypto_vpaes_encrypt
#else
b _AccelerateCrypto_vpaes_decrypt
#endif
#else // __ARM_NEON__
// set up debug trace frame pointer
push {r7,lr}
mov r7, sp
// now set up the stack for the current function
push {r1,r4-r6,r8-r11}
sub sp, #(16+8) // make sp 16-byte aligned
// copy r0,r2 to r4,r11 to release r0,r2 (r1 is saved on the stack) for use as S0-S3
mov r4, r0
mov ExpandedKey, r2
// Get and check "key length".
ldr t, [ExpandedKey, #ContextKeyLength]
cmp t, #160
beq 2f
cmp t, #192
beq 2f
cmp t, #224
beq 2f
mov r0, #-1 // Return error.
b 9f
2:
#if (Select == 0)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKeyEnd, ExpandedKey, t
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKeyEnd, ExpandedKey
add ExpandedKey, t
#endif
/*
we need to do this because otherwise ldmia $0, {$1-$4} will hit a memory access error when $0 is not word-aligned in thumb state
*/
.macro thumb2_ldmia
ldr $1, [$0, #0]
ldr $2, [$0, #4]
ldr $3, [$0, #8]
ldr $4, [$0, #12]
.endm
.macro thumb2_stmia
str $1, [$0, #0]
str $2, [$0, #4]
str $3, [$0, #8]
str $4, [$0, #12]
.endm
// Initialize State from input text.
// we need to do this because otherwise ldmia will crash when the input (pointed to by r4) is not word-aligned
thumb2_ldmia r4, S0, S1, S2, S3
// Add round key and save results.
thumb2_ldmia ExpandedKey, r4, r5, r8, r10
add ExpandedKey, #Increment
eor S0, r4
eor S1, r5
eor S2, r8
eor S3, r10
// Set up r6 = _AESEncryptTable or _AESDecryptTable
ldr r6, L_table1
L_table0:
mov r12, pc
ldr r6, [r12, r6]
// save S0-S3 in the stack memory
stmia sp, {S0-S3}
// use this to extract a byte from a shifted word; tried uxtb, same complexity, but that would limit us to armv6 or above
mov r9, #0xff
// Get round key.
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
add ExpandedKey, #Increment
// per round operation
/*
the following macro defines the per round operation for aes
the state computed from the previous round is now saved in sp[0:15]
and r0-r3 has been initialized with the next expanded round key
the macro reads those 16 bytes in sp[0:15] and for each byte does a table look up
the result (4-byte) word is xor-ed to one of r0-r3
the final r0-r3 is the aes state
r6 : points to Main or Final table
r9 : 0xff is used as a byte mask
*/
.macro aes_per_round
#if defined (__ARM_ARCH_7S__)
// better for swift (and old cortex-a8)
// S0 process
ldr t, [sp, #0] // load 4 bytes for S0 process
and r4, r9, t // byte 0
and r5, r9, t, lsr #8 // byte 1
ldr r4, [r6, r4, lsl #2] // 1st table lookup
and r8, r9, t, lsr #16 // byte 2
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
and r10, r9, t, lsr #24 // byte 3
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldr r10, [r6, r10, lsl #2] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
ldr t, [sp, #4] // read Word for next S1 process
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S1, r4
ldr r10, [r6, r10, lsl #2]
eor P0, r5, ror #24
ldr t, [sp, #8]
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S2, r4
ldr r10, [r6, r10, lsl #2]
eor P1, r5, ror #24
ldr t, [sp, #12]
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
and r4, r9, t
and r5, r9, t, lsr #8
ldr r4, [r6, r4, lsl #2]
and r8, r9, t, lsr #16
ldr r5, [r6, r5, lsl #2]
and r10, r9, t, lsr #24
ldr r8, [r6, r8, lsl #2]
eor S3, r4
ldr r10, [r6, r10, lsl #2]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#else
// better for cortex-a7 and cortex-a9
// S0 process
ldrb r4, [sp, #0] // byte 0
ldrb r5, [sp, #1] // byte 1
ldrb r8, [sp, #2] // byte 2
ldrb r10, [sp, #3] // byte 3
ldr r4, [r6, r4, lsl #2] // 1st table lookup
ldr r5, [r6, r5, lsl #2] // 2nd table lookup
ldr r8, [r6, r8, lsl #2] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldr r10, [r6, r10, lsl #2] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
ldrb r4, [sp, #4] // byte 0
ldrb r5, [sp, #5] // byte 1
ldrb r8, [sp, #6] // byte 2
ldrb r10, [sp, #7] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S1, r4
ldr r10, [r6, r10, lsl #2]
eor P0, r5, ror #24
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
ldrb r4, [sp, #8] // byte 0
ldrb r5, [sp, #9] // byte 1
ldrb r8, [sp, #10] // byte 2
ldrb r10, [sp, #11] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S2, r4
ldr r10, [r6, r10, lsl #2]
eor P1, r5, ror #24
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
ldrb r4, [sp, #12] // byte 0
ldrb r5, [sp, #13] // byte 1
ldrb r8, [sp, #14] // byte 2
ldrb r10, [sp, #15] // byte 3
ldr r4, [r6, r4, lsl #2]
ldr r5, [r6, r5, lsl #2]
ldr r8, [r6, r8, lsl #2]
eor S3, r4
ldr r10, [r6, r10, lsl #2]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#endif
.endm
.macro aes_last_round
#if defined (__ARM_ARCH_7S__)
// better for swift (and old cortex-a8)
// S0 process
ldr t, [sp, #0] // load 4 bytes for S0 process
and r4, r9, t // byte 0
and r5, r9, t, lsr #8 // byte 1
ldrb r4, [r6, r4] // 1st table lookup
and r8, r9, t, lsr #16 // byte 2
ldrb r5, [r6, r5] // 2nd table lookup
and r10, r9, t, lsr #24 // byte 3
ldrb r8, [r6, r8] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldrb r10, [r6, r10] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
ldr t, [sp, #4] // read Word for next S1 process
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S1, r4
ldrb r10, [r6, r10]
eor P0, r5, ror #24
ldr t, [sp, #8]
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S2, r4
ldrb r10, [r6, r10]
eor P1, r5, ror #24
ldr t, [sp, #12]
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
and r4, r9, t
and r5, r9, t, lsr #8
ldrb r4, [r6, r4]
and r8, r9, t, lsr #16
ldrb r5, [r6, r5]
and r10, r9, t, lsr #24
ldrb r8, [r6, r8]
eor S3, r4
ldrb r10, [r6, r10]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#else
// better for cortex-a7 and cortex-a9
// S0 process
ldrb r4, [sp, #0] // byte 0
ldrb r5, [sp, #1] // byte 1
ldrb r8, [sp, #2] // byte 2
ldrb r10, [sp, #3] // byte 3
ldrb r4, [r6, r4] // 1st table lookup
ldrb r5, [r6, r5] // 2nd table lookup
ldrb r8, [r6, r8] // 3rd table lookup
eor S0, r4 // S0 ^= 1st table lookup
ldrb r10, [r6, r10] // 4th table lookup
eor P3, r5, ror #24 // P3 ^= 2nd table lookup
eor S2, r8, ror #16 // S2 ^= 3rd table lookup
eor P1, r10, ror #8 // P1 ^= 4th table lookup
// S1 process
ldrb r4, [sp, #4] // byte 0
ldrb r5, [sp, #5] // byte 1
ldrb r8, [sp, #6] // byte 2
ldrb r10, [sp, #7] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S1, r4
ldrb r10, [r6, r10]
eor P0, r5, ror #24
eor S3, r8, ror #16
eor P2, r10, ror #8
// S2 process
ldrb r4, [sp, #8] // byte 0
ldrb r5, [sp, #9] // byte 1
ldrb r8, [sp, #10] // byte 2
ldrb r10, [sp, #11] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S2, r4
ldrb r10, [r6, r10]
eor P1, r5, ror #24
eor S0, r8, ror #16
eor P3, r10, ror #8
// S3 process
ldrb r4, [sp, #12] // byte 0
ldrb r5, [sp, #13] // byte 1
ldrb r8, [sp, #14] // byte 2
ldrb r10, [sp, #15] // byte 3
ldrb r4, [r6, r4]
ldrb r5, [r6, r5]
ldrb r8, [r6, r8]
eor S3, r4
ldrb r10, [r6, r10]
eor P2, r5, ror #24
eor S1, r8, ror #16
eor P0, r10, ror #8
#endif
.endm
1:
aes_per_round
// Save state for next iteration and load next round key.
stmia sp,{S0-S3}
thumb2_ldmia ExpandedKey, S0, S1, S2, S3
cmp ExpandedKeyEnd, ExpandedKey
add ExpandedKey, #Increment
bne 1b
// set up r6 = _AESSubBytesWordTable or _AESInvSubBytesWordTable
ldr r6, L_table3
L_table2:
mov r12, pc
ldr r6, [r12, r6]
aes_last_round
ldr r4, [sp, #(16+8)] // restore OutputText
thumb2_stmia r4, S0, S1, S2, S3
eor r0, r0 // Return success.
9:
add sp, #(4+16+8) // skip r1 restore
pop {r4-r6,r8-r11}
pop {r7, pc}
.p2align 2
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+4)
.p2align 2
L_table3:
.long L_Tab$non_lazy_ptr2-(L_table2+4)
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 2
L_Tab$non_lazy_ptr:
.indirect_symbol MTable
.long 0
.p2align 2
L_Tab$non_lazy_ptr2:
.indirect_symbol FTable
.long 0
#endif // __ARM_NEON__
#undef S0
#undef S1
#undef S2
#undef S3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment
#endif /* defined(__arm__) */

View File

@ -0,0 +1,13 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#define Select 1
#include "EncryptDecrypt.s"
#undef Select

View File

@ -0,0 +1,13 @@
# Copyright (c) (2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#define Select 0
#include "EncryptDecrypt.s"
#undef Select

View File

@ -0,0 +1,751 @@
# Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if !defined(__arm64__) && defined(__ARM_NEON__)
#define ekey r2
#define eax r4
.macro save_all_neon
#if BUILDKERNEL
vstmdb sp!, {q12-q15}
vstmdb sp!, {q8-q11}
vstmdb sp!, {q0-q3}
#endif
vstmdb sp!, {q4-q7}
.endm
.macro restore_all_neon
vldmia sp!, {q4-q7}
#if BUILDKERNEL
vldmia sp!, {q0-q3}
vldmia sp!, {q8-q11}
vldmia sp!, {q12-q15}
#endif
.endm
.macro vpaes_push
push {r4-r7,lr}
add r7, sp, #12
push {r8,r10,r11}
.endm
.macro vpaes_pop
pop {r8,r10,r11}
pop {r4-r7,pc}
.endm
.p2align 6
.Lk_ipt:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_mc_forward:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.p2align 4
vpaes_encrypt_core:
mov r9, ekey
mov r11, #16
adr r10, .Lk_ipt
ldr eax, [ekey, #240]
vldmia r10!,{q3-q4}
vbic q1, q0, q9
vld1.8 {q5}, [r9]!
vshr.u32 q1, q1, #4
vand q0, q0, q9
vtbl.8 d4, {q3}, d0
vtbl.8 d5, {q3}, d1
adr r10, .Lk_mc_backward
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
veor q2, q2, q5
veor q0, q0, q2
cmp eax, #0
b .Lenc_entry
.p2align 4
.Lenc_loop:
vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q4, q4, q5
add r12, r10, r11
veor q5, q0, q4
vld1.8 {q4}, [r12 :128]
sub r12, r12, #64
vtbl.8 d12, {q15}, d4
vtbl.8 d13, {q15}, d5
vld1.8 {q1}, [r12 :128]
vtbl.8 d4, {q14}, d6
vtbl.8 d5, {q14}, d7
veor q2, q2, q6
vtbl.8 d6, {q5}, d8
vtbl.8 d7, {q5}, d9
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
veor q5, q0, q2
add r11, r11, #16
veor q3, q3, q5
vtbl.8 d0, {q5}, d2
vtbl.8 d1, {q5}, d3
and r11, r11, #48
subs eax, eax, #1
veor q0, q0, q3
.Lenc_entry:
vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d10, {q11}, d0
vtbl.8 d11, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q3, q3, q5
veor q4, q4, q5
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
veor q2, q2, q0
veor q3, q3, q1
vld1.8 {q5}, [r9]!
bgt .Lenc_loop
adr r12, .Lk_sbo
vld1.8 {q1}, [r12]!
vtbl.8 d8, {q1}, d4
vtbl.8 d9, {q1}, d5
vld1.8 {q2}, [r12]
add r12, r10, r11
veor q4, q4, q5
add r12, r12, #64
vtbl.8 d0, {q2}, d6
vtbl.8 d1, {q2}, d7
vld1.8 {q1}, [r12]
veor q2, q0, q4
vtbl.8 d0, {q2}, d2
vtbl.8 d1, {q2}, d3
bx lr
.p2align 4
.Lk_dipt:
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
.quad 0x000302010C0F0E0D, 0x080B0A0904070605 // .Lk_mc_forward+48
.Lk_dsb9:
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x0F060D040B020900, 0x070E050C030A0108
.p2align 4
vpaes_decrypt_core:
mov r9, r2 // dkey
ldr eax, [r2, #240] // Nr
adr r12, .Lk_dipt
vbic q1, q0, q9
vld1.64 {q3}, [r12 :128]!
vshr.u32 q1, q1, #4
vld1.8 {q5}, [r9]!
lsl r11, eax, #4
vand q2, q0, q9
vtbl.8 d4, {q3}, d4
vtbl.8 d5, {q3}, d5
vld1.64 {q4}, [r12 :128]!
eor r11, r11, #48
adr r10, .Lk_dsbd
vtbl.8 d0, {q4}, d2
vtbl.8 d1, {q4}, d3
and r11, r11, #48
veor q2, q2, q5
vld1.64 {q5}, [r12 :128]!
veor q0, q0, q2
cmp eax, #0
b .Ldec_entry
.p2align 4
.Ldec_loop:
sub r12, r10, 32
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
vld1.64 {q6-q7}, [r12 :128]!
veor q0, q0, q4
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
veor q6, q0, q1
vtbl.8 d2, {q7}, d6
vtbl.8 d3, {q7}, d7
vtbl.8 d0, {q6}, d10
vtbl.8 d1, {q6}, d11
veor q0, q0, q4
vext.8 q5, q5, q5, #12
veor q0, q0, q1
subs eax, eax, #1
.Ldec_entry:
vbic q1, q0, q9
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
vld1.8 {q0}, [r9]!
veor q3, q3, q1
bne .Ldec_loop
vld1.64 {q6-q7}, [r12 :128]!
vtbl.8 d8, {q6}, d4
vtbl.8 d9, {q6}, d5
add r12, r12, r11, lsr #1
vtbl.8 d6, {q7}, d6
vtbl.8 d7, {q7}, d7
vld1.64 {q2}, [r12]
veor q0, q0, q4
veor q1, q0, q3
vtbl.8 d0, {q1}, d4
vtbl.8 d1, {q1}, d5
bx lr
.p2align 6
.Lk_ipt2:
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_rcon:
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_sr:
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.p2align 4
vpaes_schedule_core:
bl vpaes_preheat
adr r10, .Lk_rcon
vld1.8 {q0}, [r0]
vld1.64 {q8}, [r10 :128]!
vmov q3, q0
adr r11, .Lk_ipt2
bl vpaes_schedule_transform
vmov q7, q0
cmp r3, #0
bne .Lschedule_am_decrypting
vst1.8 {q0}, [r2]
b .Lschedule_go
.Lschedule_am_decrypting:
add r12, r10, r8
vmov q1, q3
vld1.8 {q3}, [r12]
vtbl.8 d6, {q1}, d6
vtbl.8 d7, {q1}, d7
eor r8, r8, #48
vst1.8 {q3}, [r2]
.Lschedule_go:
cmp r1, #192
bgt .Lschedule_256
beq .Lschedule_192
.Lschedule_128:
mov r1, #10
.Loop_schedule_128:
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
b .Loop_schedule_128
.p2align 4
.Lschedule_192:
add r12, r0, #8
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
vmov d13, d1
veor d12, d12, d12
mov r1, #4
.Loop_schedule_192:
bl vpaes_schedule_round
vext.8 q0, q6, q0, #8
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
bl vpaes_schedule_mangle
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
bl vpaes_schedule_192_smear
b .Loop_schedule_192
.p2align 4
.Lschedule_256:
add r12, r0, #16
vld1.8 {q0}, [r12]
bl vpaes_schedule_transform
mov r1, #7
.Loop_schedule_256:
bl vpaes_schedule_mangle
vmov q6, q0
bl vpaes_schedule_round
subs r1, r1, #1
beq .Lschedule_mangle_last
bl vpaes_schedule_mangle
vdup.32 q0, d1[1]
vmov q5, q7
vmov q7, q6
bl vpaes_schedule_low_round
vmov q7, q5
b .Loop_schedule_256
.p2align 4
.Lk_opt:
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.p2align 4
.Lschedule_mangle_last:
adr r11, .Lk_deskew
cmp r3, #0
bne .Lschedule_mangle_last_dec
add r12, r8, r10
vld1.8 {q1}, [r12]
adr r11, .Lk_opt
vtbl.8 d2, {q0}, d2
vtbl.8 d3, {q0}, d3
vmov q0, q1
add r2, r2, #32
.Lschedule_mangle_last_dec:
adr r12, .Lk_s63
sub r2, r2, #16
vld1.8 {q1}, [r12]
veor q0, q0, q1
bl vpaes_schedule_transform
vst1.8 {q0}, [r2]
restore_all_neon
eor r0, r0, r0
vpaes_pop
.p2align 4
vpaes_schedule_192_smear:
vdup.32 q1, d12[0]
vdup.32 q0, d15[1]
vmov s7, s26
vmov s0, s30
veor q6, q6, q1
veor q6, q6, q0
vmov q0, q6
veor d12, d12, d12
bx lr
.p2align 4
vpaes_schedule_round:
veor q1, q1, q1
vext.8 q1, q8, q1, #15
vext.8 q8, q8, q8, #15
veor q7, q7, q1
vdup.32 q0, d1[1]
vext.8 q0, q0, q0, #1
vpaes_schedule_low_round:
veor q1, q1, q1
adr r12, .Lk_s63
vext.8 q1, q1, q7, #12
veor q2, q2, q2
veor q7, q7, q1
vld1.8 {q1}, [r12]
vext.8 q2, q2, q7, #8
veor q7, q7, q1
veor q7, q7, q2
vbic q1, q0, q9
vshr.u32 q1, q1, #4
vand q0, q0, q9
vtbl.8 d4, {q11}, d0
vtbl.8 d5, {q11}, d1
veor q0, q0, q1
vtbl.8 d6, {q10}, d2
vtbl.8 d7, {q10}, d3
veor q3, q3, q2
vtbl.8 d8, {q10}, d0
vtbl.8 d9, {q10}, d1
veor q4, q4, q2
vtbl.8 d4, {q10}, d6
vtbl.8 d5, {q10}, d7
veor q2, q2, q0
vtbl.8 d6, {q10}, d8
vtbl.8 d7, {q10}, d9
veor q3, q3, q1
vtbl.8 d8, {q13}, d4
vtbl.8 d9, {q13}, d5
vtbl.8 d0, {q12}, d6
vtbl.8 d1, {q12}, d7
veor q0, q0, q4
veor q0, q0, q7
vmov q7, q0
bx lr
.p2align 4
vpaes_schedule_transform:
vbic q1, q0, q9
vldmia r11, {q4-q5}
vand q0, q0, q9
vshr.u32 q1, q1, #4
vtbl.8 d0, {q4}, d0
vtbl.8 d1, {q4}, d1
vtbl.8 d2, {q5}, d2
vtbl.8 d3, {q5}, d3
veor q0, q0, q1
bx lr
.p2align 4
.Lk_mc_forward2:
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.Lk_s63:
.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
.Lk_dksd:
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
.p2align 4
vpaes_schedule_mangle:
vstmdb sp!, {q6-q7}
adr r12, .Lk_mc_forward2
vmov q4, q0
cmp r3, #0
vldmia r12!, {q5-q6} // q5 = Lk_mc_forward2, q6 = Lk_s63
bne .Lschedule_mangle_dec
add r2, r2, #16
veor q4, q4, q6
vtbl.8 d6, {q4}, d10
vtbl.8 d7, {q4}, d11
vtbl.8 d8, {q3}, d10
vtbl.8 d9, {q3}, d11
vtbl.8 d2, {q4}, d10
vtbl.8 d3, {q4}, d11
veor q3, q3, q4
veor q3, q3, q1
b .Lschedule_mangle_both
.p2align 4
.Lschedule_mangle_dec:
vbic q1, q4, q9
vldmia r12!, {q6-q7}
vshr.u32 q1, q1, #4
vand q4, q4, q9
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
vldmia r12!, {q6-q7}
veor q2, q3, q2
vtbl.8 d6, {q2}, d10
vtbl.8 d7, {q2}, d11
vtbl.8 d4, {q6}, d8
vtbl.8 d5, {q6}, d9
veor q2, q2, q3
vtbl.8 d6, {q7}, d2
vtbl.8 d7, {q7}, d3
veor q3, q3, q2
sub r2, r2, #16
.Lschedule_mangle_both:
add r12, r10, r8
vld1.8 {q1}, [r12]
sub r8, r8, #16
vtbl.8 d4, {q3}, d2
vtbl.8 d5, {q3}, d3
and r8, r8, #48
vst1.8 {q2}, [r2]
vldmia sp!, {q6-q7}
bx lr
/*
int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, void *key);
*/
#define userKey r0
#define AES_bits r1
#define key r2
#define t r12
.globl _AccelerateCrypto_vpaes_set_encrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_encrypt_key:
// 128/192/256 divided by 32 = 4/6/8; plus 5 = 9/11/13
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
add t, r11, #5
mov r3, #0
str t, [key, #240]
mov r8, #48
b vpaes_schedule_core
.globl _AccelerateCrypto_vpaes_set_decrypt_key
.p2align 4
_AccelerateCrypto_vpaes_set_decrypt_key:
lsr t, AES_bits, #5
vpaes_push
mov r11, t
save_all_neon
mov r8, #32
add t, r11, #5
and r8, r8, AES_bits, lsr #1
mov r3, #1
str t, [key, #240]
add key, key, #16
eor r8, r8, #32
add key, key, t, lsl #4
b vpaes_schedule_core
/*
void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
*/
#define in r0
#define out r1
#define key r2
.globl _AccelerateCrypto_vpaes_encrypt
.p2align 4
_AccelerateCrypto_vpaes_encrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_encrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0 // return 0 for SUCCESS
vpaes_pop
.globl _AccelerateCrypto_vpaes_decrypt
.p2align 4
_AccelerateCrypto_vpaes_decrypt:
vpaes_push
save_all_neon
vld1.8 {q0}, [in]
bl vpaes_preheat
bl vpaes_decrypt_core
vst1.8 {q0}, [out]
restore_all_neon
eor r0, r0 // return 0 for SUCCESS
vpaes_pop
.p2align 4
vpaes_preheat:
adr r12, .Lk_s0F
vldmia r12, {q9-q15}
bx lr
.p2align 6
// the following 7 16-byte words are loaded into q9-q15 by vpaes_preheat
.Lk_s0F:
.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
.Lk_inv:
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_sb1:
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
#endif // !defined(__arm64__) && defined(__ARM_NEON__)

View File

@ -0,0 +1,65 @@
# Copyright (c) (2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// per block
#define in x0
#define out x1
#define key x2
#define keylen x3
#define t x5
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_AES_decrypt
_AccelerateCrypto_AES_decrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub sp, sp, #3*16
st1.4s {v0,v1,v2}, [sp]
#endif
ldr w3, [key, #240] // keylength = 32-bit
ldr q0, [in] // plain data
mov t, keylen
ldr q1, [key, t] // expanded key
sub t, t, #16
ldr q2, [key] // expanded key
0:
AESD 0, 1
AESIMC 0, 0
ldr q1, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 1
eor.16b v0, v0, v2
str q0, [out]
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2}, [sp], #48
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef keylen
#endif
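The same round structure, expressed with ARMv8 Crypto intrinsics instead of the raw AESD/AESIMC macros, looks roughly like the sketch below (illustrative only, not the routine above; it assumes drk[] already holds the decryption round keys in the order they are consumed, and compilation with the crypto extension enabled, e.g. -march=armv8-a+aes):
#include <arm_neon.h>
// Nr-1 rounds of AESD+AESIMC, one final AESD, then XOR with the last round key.
static uint8x16_t aes_decrypt_block_sketch(uint8x16_t block,
                                           const uint8x16_t *drk, int nr)
{
    for (int i = 0; i < nr - 1; i++) {
        block = vaesimcq_u8(vaesdq_u8(block, drk[i]));
    }
    block = vaesdq_u8(block, drk[nr - 1]);
    return veorq_u8(block, drk[nr]);
}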

View File

@ -0,0 +1,114 @@
# Copyright (c) (2011-2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// ecb mode
#define key x0
#define nblocks w1
#define in x2
#define out x3
#define keylen x4
#define t x5
.subsections_via_symbols
.text
.globl _AccelerateCrypto_ecb_AES_decrypt
.p2align 4
_AccelerateCrypto_ecb_AES_decrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub x4, sp, #6*16
sub sp, sp, #6*16
st1.4s {v0,v1,v2,v3}, [x4], #4*16
st1.4s {v4,v5}, [x4], #2*16
#endif
ldr w4, [key, #240] // keylength = 32-bit
ldr q5, [key] // expanded key
subs nblocks, nblocks, #4
b.lt L_lessthan4
L_4blocks:
mov t, keylen
ld1.4s {v0,v1,v2,v3}, [in], #4*16
ldr q4, [key, t] // expanded key
sub t, t, #16
0:
AESD 0, 4
AESIMC 0, 0
AESD 1, 4
AESIMC 1, 1
AESD 2, 4
AESIMC 2, 2
AESD 3, 4
AESIMC 3, 3
ldr q4, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 4
eor.16b v0, v0, v5
AESD 1, 4
eor.16b v1, v1, v5
AESD 2, 4
eor.16b v2, v2, v5
AESD 3, 4
eor.16b v3, v3, v5
st1.4s {v0,v1,v2,v3}, [out], #4*16
subs nblocks, nblocks, #4
b.ge L_4blocks
L_lessthan4:
ands nblocks, nblocks, #3
b.eq 9f
L_1block:
mov t, keylen
ldr q0, [in], #16 // plain data
ldr q4, [key, t] // expanded key
sub t, t, #16
0:
AESD 0, 4
AESIMC 0, 0
ldr q4, [key, t] // expanded key
subs t, t, #16
b.gt 0b
AESD 0, 4
eor.16b v0, v0, v5
str q0, [out], #16
subs nblocks, nblocks, #1
b.gt L_1block
9:
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2,v3}, [sp], #4*16
ld1.4s {v4,v5}, [sp], #2*16
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef nblocks
#undef keylen
#endif

View File

@ -0,0 +1,66 @@
# Copyright (c) (2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
// per block implementation
#define in x0
#define out x1
#define key x2
#define keylen x3
#define t x5
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_AES_encrypt
_AccelerateCrypto_AES_encrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub sp, sp, #3*16
st1.4s {v0,v1,v2}, [sp]
#endif
ldr w3, [key, #240] // keylength = 32-bit, 160/192/224
ldr q0, [in] // plain data
ldr q1, [key] // expanded key
ldr q2, [key, keylen] // final expanded key
mov t, #16
0:
AESE 0, 1
AESMC 0, 0
ldr q1, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b
AESE 0, 1
eor.16b v0, v0, v2
str q0, [out]
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2}, [sp], #48
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef keylen
#endif
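For comparison, the encrypt path above maps onto ARMv8 Crypto intrinsics roughly as follows (again an illustrative sketch, with rk[] holding the expanded encryption round keys in order and the crypto extension enabled at compile time):
#include <arm_neon.h>
// Nr-1 rounds of AESE+AESMC, one final AESE, then XOR with the last round key.
static uint8x16_t aes_encrypt_block_sketch(uint8x16_t block,
                                           const uint8x16_t *rk, int nr)
{
    for (int i = 0; i < nr - 1; i++) {
        block = vaesmcq_u8(vaeseq_u8(block, rk[i]));
    }
    block = vaeseq_u8(block, rk[nr - 1]);
    return veorq_u8(block, rk[nr]);
}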

View File

@ -0,0 +1,119 @@
# Copyright (c) (2011-2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
#define key x0
#define nblocks w1
#define in x2
#define out x3
#define keylen x4
#define t x5
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_ecb_AES_encrypt
_AccelerateCrypto_ecb_AES_encrypt:
BRANCH_TARGET_CALL
#if BUILDKERNEL
// save used vector registers
sub x4, sp, #6*16
sub sp, sp, #6*16
st1.4s {v0,v1,v2,v3}, [x4], #4*16
st1.4s {v4,v5}, [x4], #2*16
#endif
ldr w4, [key, #240] // keylength = 32-bit, 160/192/224
subs nblocks, nblocks, #4 // pre-decrement nblocks by 4
ldr q5, [key, keylen] // expanded key
b.lt 1f // if nblocks < 4, go to scalar loop
L_4blocks:
// handle 4 blocks per iteration
ldr q4, [key] // expanded key
mov t, #16
ld1.4s {v0,v1,v2,v3}, [in], #4*16
0:
AESE 0, 4
AESMC 0, 0
AESE 1, 4
AESMC 1, 1
AESE 2, 4
AESMC 2, 2
AESE 3, 4
AESMC 3, 3
ldr q4, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b
AESE 0, 4
eor.16b v0, v0, v5
AESE 1, 4
eor.16b v1, v1, v5
AESE 2, 4
eor.16b v2, v2, v5
AESE 3, 4
eor.16b v3, v3, v5
st1.4s {v0,v1,v2,v3}, [out], #4*16
subs nblocks, nblocks, #4
b.ge L_4blocks
1: // handle 1 block per iteration
ands nblocks, nblocks, #3
b.eq 9f
L_1block:
ldr q4, [key] // expanded key
mov t, #16
ldr q0, [in], #16 // plain data
0:
AESE 0, 4
AESMC 0, 0
ldr q4, [key, t] // expanded key
add t, t, #16
cmp t, keylen
b.lt 0b
AESE 0, 4
eor.16b v0, v0, v5
str q0, [out], #16
subs nblocks, nblocks, #1
b.gt L_1block
9:
#if BUILDKERNEL
// restore used vector registers
ld1.4s {v0,v1,v2,v3}, [sp], #4*16
ld1.4s {v4,v5}, [sp], #2*16
#endif
mov x0, #0
ret lr
#undef in
#undef out
#undef key
#undef nblocks
#undef keylen
#endif
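The batched 4-block ECB entry points above exist only on arm64 (see the __arm64__ guards in the header); elsewhere ECB reduces to a loop over the documented per-block API. A minimal fallback sketch:
#include <stdint.h>
#include "AccelerateCrypto.h"
// Fallback ECB encryption: one AccelerateCrypto_AES_encrypt call per 16-byte
// block. Returns 0 on success, or the first nonzero per-block error.
static int ecb_encrypt_fallback(const AccelerateCrypto_AES_ctx *ctx,
                                uint32_t nblocks, const uint8_t *in, uint8_t *out)
{
    for (uint32_t i = 0; i < nblocks; i++) {
        int rc = AccelerateCrypto_AES_encrypt(in + 16u * i, out + 16u * i, ctx);
        if (rc != 0) {
            return rc;
        }
    }
    return 0;
}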

View File

@ -0,0 +1,25 @@
/* Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CONTEXT_H_
#define _CORECRYPTO_CONTEXT_H_
// Define byte offset of key within context structure.
#define ContextKey 0
/* Define byte offset of key length within context structure. The number
stored there is the number of bytes from the start of the first round key
to the start of the last round key. That is 16 less than the number of
bytes in the entire key.
*/
#define ContextKeyLength 240
#endif /* _CORECRYPTO_CONTEXT_H_ */
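A small worked example of the "key length" convention (consistent with the 160/192/224 checks in the armv7 routine earlier in this commit; the helper name is made up for illustration):
// AES-128: 10 rounds -> 11 round keys -> 11*16 = 176 bytes total -> 176-16 = 160
// AES-192: 12 rounds -> 13 round keys -> 13*16 = 208 bytes total -> 208-16 = 192
// AES-256: 14 rounds -> 15 round keys -> 15*16 = 240 bytes total -> 240-16 = 224
static inline unsigned aes_context_key_length(unsigned nrounds)
{
    return 16u * nrounds;   // value stored at byte offset ContextKeyLength (240)
}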

File diff suppressed because it is too large

View File

@ -0,0 +1,576 @@
# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#if defined(__i386__) || defined(__x86_64__)
/* This file defines _vng_aes_encrypt or _vng_aes_decrypt, according to the value of
the Select preprocessor symbol. This file is designed to be included in
another assembly file using the preprocessor #include directive, to benefit
from some assembly-time calculations.
These two routines are nearly identical. They differ only in the tables
they use, the direction they iterate through the key, and the permutation
performed on part of the state.
Written by Eric Postpischil, January 2008.
*/
#if Select == 0
#define Name _aes_encrypt_nonaesni // Routine name.
#define MTable _AESEncryptTable // Main table.
#define FTable _AESSubBytesWordTable // Final table.
#define P0 S0 // State permutation.
#define P1 S1
#define P2 S2
#define P3 S3
#define Increment +16 // ExpandedKey increment.
#elif Select == 1
#define Name _aes_decrypt_nonaesni // Routine name.
#define MTable _AESDecryptTable // Main table.
#define FTable _AESInvSubBytesWordTable // Final table.
#define P0 S2 // State permutation.
#define P1 S3
#define P2 S0
#define P3 S1
#define Increment -16 // ExpandedKey increment.
#endif // Select
/* Routine:
_AESEncryptWithExpandedKey (if Select is 0) or
_AESDecryptWithExpandedKey (if Select is 1).
Function:
Perform the AES cipher or its inverse as defined in Federal Information
Processing Standards Publication 197 (FIPS-197), November 26, 2001.
The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.
Input:
Constant data:
The following names must be locally defined so the assembler
can calculate certain offsets.
For encryption:
static const Word _AESEncryptTable[4][256].
_AESEncryptTable[i] contains the tables T[i] defined in AES
Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
Vincent Rijmen, section 5.2.1, page 18. These tables
combine the SubBytes and MixColumns operations.
static const Word _AESSubBytesWordTable[4][256].
_AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
SubBytes is defined in FIPS-197. _AESSubBytesWordTable
differs from _AESEncryptTable in that it does not include
the MixColumn operation. It is used in performing the last
round, which differs from the previous rounds in that it
does not include the MixColumn operation.
For decryption:
static const Word _AESDecryptTable[4][256].
The analog of _AESEncryptTable for decryption.
static const Word _AESInvSubBytesWordTable[4][256].
_AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
where InvSubBytes is defined in FIPS-197.
_AESInvSubBytesWordTable differs from _AESDecryptTable in
that it does not include the InvMixColumn operation. It is
used in performing the last round, which differs from the
previous rounds in that it does not include the
InvMixColumn operation.
Arguments:
const Byte *InputText.
Address of input, 16 bytes. Best if four-byte aligned.
Byte *OutputText.
Address of output, 16 bytes. Best if four-byte aligned.
vng_aes_encrypt_ctx *Context or vng_aes_decrypt_ctx *Context
vng_aes_encrypt_ctx and vng_aes_decrypt_ctx are identical except the
former is used for encryption and the latter for decryption.
Each is a structure containing the expanded key beginning at
offset ContextKey and a four-byte "key length" beginning at
offset ContextKeyLength. The "key length" is the number of
bytes from the start of the first round key to the start of the
last round key. That is 16 less than the number of bytes in
the entire key.
Output:
Encrypted or decrypted data is written to *OutputText.
Return:
aes_rval // -1 if "key length" is invalid. 0 otherwise.
*/
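A rough C model of the calling contract documented above (a sketch; the context is treated as opaque here and the wrapper name is made up):
#include <stdint.h>
/* Entry point defined by this file when Select == 0; context kept opaque. */
extern int aes_encrypt_nonaesni(const void *in, void *out, const void *ctx);
static int encrypt_one_block(const uint8_t in[16], uint8_t out[16], const void *ctx)
{
    /* Returns 0 on success, or -1 when the "key length" stored in the
       context is not 160, 192 or 224 (i.e. not AES-128/192/256). */
    return aes_encrypt_nonaesni(in, out, ctx);
}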
.text
.globl Name
Name:
// Jimmur removed the capabilities check and the jump to the AESNI code. This
// will be handled by the C code.
// Push new stack frame.
push r5
/* Save registers and set SaveSize to the number of bytes pushed onto the
stack so far, including the caller's return address.
*/
push r3
#if defined __i386__
push r6
push r7
#define SaveSize (5*4)
#else
#define SaveSize (3*8)
#endif
/* Number of bytes used for local variables:
4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd.
5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers.
*/
#define LocalsSize (Arch(4, 0) + Arch(5, 3)*16)
#if 0 < LocalsSize
// Padding to position stack pointer at a multiple of 16 bytes.
#define Padding (15 & -(SaveSize + LocalsSize))
sub $Padding + LocalsSize, r4 // Allocate space on stack.
#else
#define Padding 0
#endif
#if BUILDKERNEL
// Save XMM registers.
movaps %xmm0, 0*16(r4)
movaps %xmm1, 1*16(r4)
movaps %xmm2, 2*16(r4)
#if defined __i386__
movaps %xmm3, 3*16(r4)
movaps %xmm4, 4*16(r4)
#endif
#endif // BUILDKERNEL
#if defined __i386__
// Number of bytes from caller's stack pointer to ours.
#define StackFrame (SaveSize + Padding + LocalsSize)
// Define location of argument i (presuming 4-byte arguments).
#define Argument(i) StackFrame+4*(i)(%esp)
#define ArgInputText Argument(0)
#define ArgOutputText Argument(1)
#define ArgContext Argument(2)
#elif defined __x86_64__
// Arguments.
#define InputText r7 // Used early then overwritten for other use.
#define OutputText r6 // Needed near end of routine.
#define ArgContext r2
/* The argument passed in r2 overlaps registers we need for other
work, so it must be moved early in the routine.
*/
#endif
#define BaseP Arch(r6, r9) // Base pointer for addressing global data.
#define ExpandedKey Arch(t0, r10) // Address of expanded key.
/* The Work registers defined below are used to hold parts of the AES state
while we dissect or assemble it. They must be assigned to the A, B, C, and
D registers so that we can access the bytes in %al, %ah, and so on.
*/
#define Work0d r0d
#define Work0l r0l
#define Work0h r0h
#define Work1d r3d
#define Work1l r3l
#define Work1h r3h
#define Work2d r1d
#define Work2l r1l
#define Work2h r1h
#define Work3d r2d
#define Work3l r2l
#define Work3h r2h
#define t0 r5
#define t0d r5d // Low 32 bits of t0.
#define t0l r5l // Low byte of t0.
#define t1 r7
/* S0, S1, S2, and S3 are where we assemble the new AES state when computing
a regular round. S1, S2, and S3 are assigned to the Work registers, but
S0 needs to go somewhere else because Work0 holds part of the old state.
*/
#define S0 Arch(t1, r8d)
#define S1 Work1d
#define S2 Work2d
#define S3 Work3d
/* These XMM registers are used as holding space, because it is faster to
spill to these registers than to the stack. (On x86_64, we do not need
to spill, because there are additional general registers available.
However, using more general registers requires saving them to the stack
and restoring them. I timed it, and no time was saved.)
*/
#define vS1 %xmm0
#define vS2 %xmm1
#define vS3 %xmm2
#if defined __i386__
#define vExpandedKey %xmm3
#define vIncrement %xmm4
#endif
// Get address of expanded key.
mov ArgContext, ExpandedKey
#if 0 != ContextKey
add $ContextKey, ExpandedKey
#endif
/* Store a sentinel value of ExpandedKey on the stack on i386, or in a register on
x86_64.
*/
#define ExpandedKeyEnd Arch(5*16(r4), r11)
// Get and check "key length".
movzb ContextKeyLength(ExpandedKey), r0
cmp $160, r0
je 2f
cmp $192, r0
je 2f
cmp $224, r0
je 2f
mov $-1, r0 // Return error.
jmp 9f
2:
#if (Select == 0 || Select == 2)
// For encryption, prepare to iterate forward through expanded key.
add ExpandedKey, r0
mov r0, ExpandedKeyEnd
#else
// For decryption, prepare to iterate backward through expanded key.
mov ExpandedKey, ExpandedKeyEnd
add r0, ExpandedKey
#endif
// Initialize State from input text.
#if defined __i386__
mov ArgInputText, BaseP
#define InputText BaseP
#endif
mov 0*4(InputText), Work0d
mov 1*4(InputText), S1
mov 2*4(InputText), S2
mov 3*4(InputText), S3
#undef InputText // Register is reused after this for other purposes.
// Add round key and save results.
xor 0*4(ExpandedKey), Work0d // S0 is in dissection register.
xor 1*4(ExpandedKey), S1
movd S1, vS1 // Save S1 to S3 in vector registers.
xor 2*4(ExpandedKey), S2
movd S2, vS2
xor 3*4(ExpandedKey), S3
movd S3, vS3
add $Increment, ExpandedKey // Advance to next round key.
#if defined __i386__
// Save expanded key address and increment in vector registers.
mov $Increment, t1
movp ExpandedKey, vExpandedKey
movp t1, vIncrement
#endif
// Set up relative addressing.
#if defined __i386__
// Get address of 0 in BaseP.
call 0f // Push program counter onto stack.
0:
pop BaseP // Get program counter.
// Define macros to help address data.
#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4)
#elif defined __x86_64__
lea MTable(%rip), BaseP
// Define macros to help address data.
#define LookupM(table, index) (table)*TableSize(BaseP, index, 4)
#define LookupF(table, index) (table)*TableSize(BaseP, index, 4)
/* With these definitions of LookupM and LookupF, BaseP must be loaded with
the address of the table at the point where it is used. So we need an
instruction to change BaseP after we are done with MTable and before we
start using FTable. I would prefer to use something like:
.set FMinusM, FTable - MTable
#define LookupF(table, index) \
FMinusM+(table)*TableSize(BaseP, index, 4)
Then BaseP would not need to change. However, this fails due to an
assembler/linker bug.
*/
#endif
// Get round key.
mov 0*4(ExpandedKey), S0
mov 1*4(ExpandedKey), S1
mov 2*4(ExpandedKey), S2
mov 3*4(ExpandedKey), S3
1:
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/
// Process previous S0.
movzb Work0l, t0
xor LookupM(0, t0), S0
movzb Work0h, t0d
xor LookupM(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S2
movzb Work0h, t0d
xor LookupM(3, t0), P1
// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S1
movzb Work0h, t0d
xor LookupM(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S3
movzb Work0h, t0d
xor LookupM(3, t0), P2
// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S2
movzb Work0h, t0d
xor LookupM(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S0
movzb Work0h, t0d
xor LookupM(3, t0), P3
// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupM(0, t0), S3
movzb Work0h, t0d
xor LookupM(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupM(2, t0), S1
movzb Work0h, t0d
xor LookupM(3, t0), P0
#if defined __i386__
paddd vIncrement, vExpandedKey
movp vExpandedKey, ExpandedKey
#else
add $Increment, ExpandedKey
#endif
// Save state for next iteration and load next round key.
mov S0, Work0d
mov 0*4(ExpandedKey), S0
movd S1, vS1
mov 1*4(ExpandedKey), S1
movd S2, vS2
mov 2*4(ExpandedKey), S2
movd S3, vS3
mov 3*4(ExpandedKey), S3
cmp ExpandedKeyEnd, ExpandedKey
jne 1b
/* Word 0 of the current state must be in Work0 now, and the next round
key must be in S0 to S3.
*/
// Work around assembler bug. See comments above about Radar 5683882.
#if defined __x86_64__
lea FTable(%rip), BaseP
#endif
// Process previous S0.
movzb Work0l, t0
xor LookupF(0, t0), S0
movzb Work0h, t0d
xor LookupF(1, t0), P3
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S2
movzb Work0h, t0d
xor LookupF(3, t0), P1
// Process previous S1.
movd vS1, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S1
movzb Work0h, t0d
xor LookupF(1, t0), P0
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S3
movzb Work0h, t0d
xor LookupF(3, t0), P2
// Process previous S2.
movd vS2, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S2
movzb Work0h, t0d
xor LookupF(1, t0), P1
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S0
movzb Work0h, t0d
xor LookupF(3, t0), P3
// Process previous S3.
movd vS3, Work0d
movzb Work0l, t0d
xor LookupF(0, t0), S3
movzb Work0h, t0d
xor LookupF(1, t0), P2
shr $16, Work0d
movzb Work0l, t0d
xor LookupF(2, t0), S1
movzb Work0h, t0d
xor LookupF(3, t0), P0
#if defined __i386__ // Architecture.
// Get OutputText address.
#define OutputText BaseP
mov ArgOutputText, OutputText
#endif // Architecture.
// Write output.
mov S0, 0*4(OutputText)
mov S1, 1*4(OutputText)
mov S2, 2*4(OutputText)
mov S3, 3*4(OutputText)
xor r0, r0 // Return success.
9:
// Pop stack and restore registers.
#if BUILDKERNEL
#if defined __i386__
movaps 4*16(r4), %xmm4
movaps 3*16(r4), %xmm3
#endif
movaps 2*16(r4), %xmm2
movaps 1*16(r4), %xmm1
movaps 0*16(r4), %xmm0
#endif // BUILDKERNEL
#if 0 < LocalsSize
add $Padding + LocalsSize, r4
#endif
#if defined __i386__
pop r7
pop r6
#elif defined __x86_64__
#endif
pop r3
pop r5
ret
#undef ArgExpandedKey
#undef ArgInputText
#undef ArgNr
#undef ArgOutputText
#undef Argument
#undef BaseP
#undef ExpandedKey
#undef ExpandedKeyEnd
#undef FTable
#undef InputText
#undef LocalsSize
#undef LookupM
#undef LookupF
#undef MTable
#undef OutputText
#undef Padding
#undef SaveSize
#undef S0
#undef S1
#undef S2
#undef S3
#undef StackFrame
#undef Work0d
#undef Work0h
#undef Work0l
#undef Work1d
#undef Work1h
#undef Work1l
#undef Work2d
#undef Work2h
#undef Work2l
#undef Work3d
#undef Work3h
#undef Work3l
#undef t0
#undef t0d
#undef t0l
#undef t1
#undef vExpandedKey
#undef vS1
#undef vS2
#undef vS3
#undef Name
#undef MTable
#undef FTable
#undef P0
#undef P1
#undef P2
#undef P3
#undef Increment
#endif // defined(__x86_64__) || defined(__i386__)

View File

@ -0,0 +1,38 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#if (defined(__x86_64__) || defined(__i386__))
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
extern int aes_encrypt_aesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_decrypt_aesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_encrypt_nonaesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
extern int aes_decrypt_nonaesni(const void *in, void *out, const AccelerateCrypto_AES_ctx *key);
int AccelerateCrypto_AES_encrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key)
{
if (HAS_AESNI()) return aes_encrypt_aesni(in, out, key);
else
return aes_encrypt_nonaesni(in, out, key);
}
int AccelerateCrypto_AES_decrypt(const void *in, void *out, const AccelerateCrypto_AES_ctx *key)
{
if (HAS_AESNI()) return aes_decrypt_aesni(in, out, key);
else
return aes_decrypt_nonaesni(in, out, key);
}
#endif // (defined(__x86_64__) || defined(__i386__))
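HAS_AESNI() is provided by config.h, which is not shown in this view; as a hedged sketch, a runtime check of this kind typically reads CPUID leaf 1 and tests the AES-NI feature flag (ECX bit 25):
#include <cpuid.h>
/* Illustrative only -- not the config.h implementation. */
static int has_aesni_sketch(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    return (ecx & (1u << 25)) != 0;   /* CPUID.1:ECX bit 25 = AES-NI */
}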

View File

@ -0,0 +1,483 @@
# Copyright (c) (2012,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/* This file defines _aes_encrypt_aesni and _aes_decrypt_aesni --- Intel Westmere HW AES-based implementation
of _aes_encrypt and _aes_decrypt.
These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.
The AES HW is detected 1st thing in
_aes_encrypt (EncryptDecrypt.s)
_aes_decrypt (EncryptDecrypt.s)
and, if AES HW is detected, branch without link (ie, jump) to the functions here.
The implementation here follows the examples in an Intel White Paper
"Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01
Note: Rev. 03 Final (2010 01 26) is available; it appears to include some code changes relative to Rev.2 01.
*/
#if (defined __i386__ || defined __x86_64__)
.text
.p2align 4,0x90
.globl _aes_encrypt_aesni
_aes_encrypt_aesni:
#if defined __i386__
movl 4(%esp), %eax // in
movl 12(%esp), %edx // ctx
movl 8(%esp), %ecx // out
#define LOCAL_SIZE (12+16+16) // 16-byte align (-4 for return address) + 16 (xmm0) + 16 (xmm1)
#define in %eax
#define ctx %edx
#define out %ecx
#define r13 %esp
#else // x86_64
#define LOCAL_SIZE (8+16+16) // 16-byte align (-8 for return address) + 16 (xmm0) + 16 (xmm1)
#define in %rdi
#define ctx %rdx
#define out %rsi
#define r13 %rsp
#endif // i386 or x86_64
#if BUILDKERNEL
sub $LOCAL_SIZE, r13
movaps %xmm0, (r13)
#endif
movups (in), %xmm0
// key length identification
movl 240(ctx), %eax // key length
cmp $160, %eax
je L_AES_128
cmp $192, %eax
je L_AES_192
cmp $224, %eax
je L_AES_256
mov $-1, %eax // return ERROR
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
L_AES_128:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenclast 160(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
L_AES_192:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenc 160(ctx), %xmm0
aesenc 176(ctx), %xmm0
aesenclast 192(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 176(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 192(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
L_AES_256:
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor (ctx), %xmm0
aesenc 16(ctx), %xmm0
aesenc 32(ctx), %xmm0
aesenc 48(ctx), %xmm0
aesenc 64(ctx), %xmm0
aesenc 80(ctx), %xmm0
aesenc 96(ctx), %xmm0
aesenc 112(ctx), %xmm0
aesenc 128(ctx), %xmm0
aesenc 144(ctx), %xmm0
aesenc 160(ctx), %xmm0
aesenc 176(ctx), %xmm0
aesenc 192(ctx), %xmm0
aesenc 208(ctx), %xmm0
aesenclast 224(ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups (ctx), %xmm1
pxor %xmm1, %xmm0
movups 16(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 32(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 48(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 64(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 80(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 96(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 112(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 128(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 144(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 160(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 176(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 192(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 208(ctx), %xmm1
aesenc %xmm1, %xmm0
movups 224(ctx), %xmm1
aesenclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
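For readers more used to intrinsics, the aligned AES-128 path above corresponds roughly to the C sketch below (illustrative only; it assumes an expanded key of 11 round keys, i.e. the 176-byte layout implied by Context.h, and requires building with AES-NI enabled):
#include <stdint.h>
#include <wmmintrin.h>   /* AES-NI intrinsics; compile with -maes */
static void aes128_encrypt_block_sketch(const uint8_t in[16], uint8_t out[16],
                                        const __m128i rk[11])
{
    __m128i s = _mm_loadu_si128((const __m128i *)in);
    s = _mm_xor_si128(s, rk[0]);             /* pxor (ctx), %xmm0        */
    for (int r = 1; r < 10; r++)
        s = _mm_aesenc_si128(s, rk[r]);      /* aesenc 16*r(ctx), %xmm0  */
    s = _mm_aesenclast_si128(s, rk[10]);     /* aesenclast 160(ctx)      */
    _mm_storeu_si128((__m128i *)out, s);
}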
.text
.p2align 4,0x90
.globl _aes_decrypt_aesni
_aes_decrypt_aesni:
#if defined __i386__
movl 4(%esp), %eax // in
movl 12(%esp), %edx // ctx
movl 8(%esp), %ecx // out
#endif
#if BUILDKERNEL
sub $LOCAL_SIZE, r13
movaps %xmm0, (r13)
#endif
movups (in), %xmm0
// key length identification
movl 240(ctx), %eax // key length
cmp $160, %eax
je 0f // AES-128
cmp $192, %eax
je 1f // AES-192
cmp $224, %eax
je 2f // AES-256
mov $-1, %eax // return ERROR
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
0: // AES-128
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-128 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 160(ctx), %xmm1
pxor %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
1: // AES-192
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor 192(ctx), %xmm0
aesdec 176(ctx), %xmm0
aesdec 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-192 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 192(ctx), %xmm1
pxor %xmm1, %xmm0
movups 176(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 160(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
2: // AES-256
testb $15, %dl // check whether expanded key is 16-byte aligned
jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work
pxor 224(ctx), %xmm0
aesdec 208(ctx), %xmm0
aesdec 192(ctx), %xmm0
aesdec 176(ctx), %xmm0
aesdec 160(ctx), %xmm0
aesdec 144(ctx), %xmm0
aesdec 128(ctx), %xmm0
aesdec 112(ctx), %xmm0
aesdec 96(ctx), %xmm0
aesdec 80(ctx), %xmm0
aesdec 64(ctx), %xmm0
aesdec 48(ctx), %xmm0
aesdec 32(ctx), %xmm0
aesdec 16(ctx), %xmm0
aesdeclast (ctx), %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
add $LOCAL_SIZE, r13
#endif
ret
9: // AES-256 Decrypt : special case expanded key is not 16-byte aligned
#if BUILDKERNEL
movaps %xmm1, 16(r13) // save xmm1 into stack
#endif
movups 224(ctx), %xmm1
pxor %xmm1, %xmm0
movups 208(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 192(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 176(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 160(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 144(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 128(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 112(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 96(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 80(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 64(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 48(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 32(ctx), %xmm1
aesdec %xmm1, %xmm0
movups 16(ctx), %xmm1
aesdec %xmm1, %xmm0
movups (ctx), %xmm1
aesdeclast %xmm1, %xmm0
xorl %eax, %eax
movups %xmm0, (out)
#if BUILDKERNEL
movaps (r13), %xmm0
movaps 16(r13), %xmm1
add $LOCAL_SIZE, r13
#endif
ret
#endif /* x86 based build */

View File

@ -0,0 +1,146 @@
# Copyright (c) (2012,2015,2016,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/* AES.s -- Core AES routines for Intel processors.
Written by Eric Postpischil, January 30, 2008.
*/
#if (defined __i386__ || defined __x86_64__)
/* We build these AES routines as a single module because the routines refer
to labels in Data.s and it is easier and faster to refer to them as local
labels. In my implementations of AES for CommonCrypto, both i386 and
x86_64 use position-independent code. For this in-kernel implementation,
i386 has been converted to absolute addressing, but x86_64 still uses PIC.
A local label can be referred to with position-independent assembler
expressions such as "label-base(register)", where <base> is a local label
whose address has been loaded into <register>. (On i386, this is typically
done with the idiom of a call to the next instruction and a pop of that
return address into a register.) Without local labels, the references must
be done using spaces for addresses of "lazy symbols" that are filled in by
the dynamic loader and loaded by the code that wants the address.
So the various routines in other files are assembled here via #include
directives.
*/
#include "Data.s"
#define TableSize (256*4)
/* Each of the arrays defined in Data.s except for the round constants
in _AESRcon is composed of four tables of 256 entries of four bytes
each. TableSize is the number of bytes in one of those four tables.
*/
// Include constants describing the AES context structures.
#include "Context.h"
/* Define a macro to select a value based on architecture. This reduces
some of the architecture conditionalization later in the source.
*/
#if defined __i386__
#define Arch(i386, x86_64) i386
#elif defined __x86_64__
#define Arch(i386, x86_64) x86_64
#endif
// Define an instruction for moving pointers.
#define movp Arch(movd, movd)
// Latter argument should be "movq", but the assembler uses "movd".
/* Rename the general registers. This makes it easier to keep track of them
and provides names for the "whole register" that are uniform between i386
and x86_64.
*/
#if defined __i386__
#define r0 %eax // Available for any use.
#define r1 %ecx // Available for any use, some special purposes (loop).
#define r2 %edx // Available for any use.
#define r3 %ebx // Must be preserved by called routine.
#define r4 %esp // Stack pointer.
#define r5 %ebp // Frame pointer, must preserve, no bare indirect.
#define r6 %esi // Must be preserved by called routine.
#define r7 %edi // Must be preserved by called routine.
#elif defined __x86_64__
#define r0 %rax // Available for any use.
#define r1 %rcx // Available for any use.
#define r2 %rdx // Available for any use.
#define r3 %rbx // Must be preserved by called routine.
#define r4 %rsp // Stack pointer.
#define r5 %rbp // Frame pointer. Must be preserved by called routine.
#define r6 %rsi // Available for any use.
#define r7 %rdi // Available for any use.
#define r8 %r8 // Available for any use.
#define r9 %r9 // Available for any use.
#define r10 %r10 // Available for any use.
#define r11 %r11 // Available for any use.
#define r12 %r12 // Must be preserved by called routine.
#define r13 %r13 // Must be preserved by called routine.
#define r14 %r14 // Must be preserved by called routine.
#define r15 %r15 // Must be preserved by called routine.
#else
#error "Unknown architecture."
#endif
// Define names for parts of registers.
#define r0d %eax // Low 32 bits of r0.
#define r1d %ecx // Low 32 bits of r1.
#define r2d %edx // Low 32 bits of r2.
#define r3d %ebx // Low 32 bits of r3.
#define r5d %ebp // Low 32 bits of r5.
#define r6d %esi // Low 32 bits of r6.
#define r7d %edi // Low 32 bits of r7.
#define r8d %r8d // Low 32 bits of r8.
#define r9d %r9d // Low 32 bits of r9.
#define r11d %r11d // Low 32 bits of r11.
#define r0l %al // Low byte of r0.
#define r1l %cl // Low byte of r1.
#define r2l %dl // Low byte of r2.
#define r3l %bl // Low byte of r3.
#define r5l %bpl // Low byte of r5.
#define r0h %ah // Second lowest byte of r0.
#define r1h %ch // Second lowest byte of r1.
#define r2h %dh // Second lowest byte of r2.
#define r3h %bh // Second lowest byte of r3.
.text
// Define encryption routine, _AESEncryptWithExpandedKey
#define Select 0
#include "EncryptDecrypt.s"
#undef Select
// Define decryption routine, _AESDecryptWithExpandedKey
#define Select 1
#include "EncryptDecrypt.s"
#undef Select
// Define key expansion routine for encryption, _AESExpandKeyForEncryption.
// #include "ExpandKeyForEncryption.s"
// Define key expansion for decryption routine, _AESExpandKeyForDecryption.
// #include "ExpandKeyForDecryption.s"
#endif /* x86 based build */

File diff suppressed because it is too large

View File

@ -0,0 +1,362 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0]; B[0] = HASH[1]; C[0] = HASH[2]; D[0] = HASH[3]; E[0] = HASH[4];
for ( i=0; i<80; ++i ) {
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80]; HASH[1] += B[80]; HASH[2] += C[80]; HASH[3] += D[80]; HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i].
For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. update 4 consecutive W[i] (stored in a single 16-byte register)
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in a Q register, we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed,
W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2)
the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0
(with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. once W-vector is computed, W+K is then computed and saved in the stack memory, this will be used later when
updating the digests A/B/C/D/E
the execution flow (for 1 single 64-byte block) looks like
W_PRECALC_00_15 // big-endian loading of 64-bytes into 4 W-vectors, compute WK=W+K, save WK in the stack memory
W_PRECALC_16_31 // for each vector, update digests, update W (Gaudet) and WK=W+K, save WK in the stack memory
W_PRECALC_32_79 // for each vector, update digests, update W (Intel) and WK=W+K, save WK in the stack memory
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into 4 Q registers
pre_calculate and store WK = W+K(0:15) in 16-byte aligned stack memory
L_loop:
load digests a-e from ctx->state;
for (r=0;r<16;r+=4) {
digests a-e update and permute round r:r+3
update W([r:r+3]%16) (Gaudet) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=16;r<64;r+=4) {
digests a-e update and permute round r:r+3
update W([r:r+3]%16) (Intel) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=4) {
digests a-e update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into 4 Q registers
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-e;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=4) {
digests a-e update and permute round r:r+3
}
ctx->states += digests a-e;
----------------------------------------------------------------------------------------------------------
*/
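A scalar C model of the Gaudet update described above may make the lane-3 fix-up easier to follow (a sketch; the helper name is invented):
#include <stdint.h>
static inline uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
/* One 4-lane W update for rounds 16..31 as used above (i a multiple of 4). */
static void w_update_gaudet(uint32_t W[80], int i)
{
    W[i]     = rol32(W[i-3] ^ W[i-8]  ^ W[i-14] ^ W[i-16], 1);
    W[i + 1] = rol32(W[i-2] ^ W[i-7]  ^ W[i-13] ^ W[i-15], 1);
    W[i + 2] = rol32(W[i-1] ^ W[i-6]  ^ W[i-12] ^ W[i-14], 1);
    /* lane 3 would need W[i], which is being computed in this very vector,
       so substitute 0 here ...                                            */
    W[i + 3] = rol32(0      ^ W[i-5]  ^ W[i-11] ^ W[i-13], 1);
    /* ... and patch it afterwards with the freshly computed W[i] rol 1.   */
    W[i + 3] ^= rol32(W[i], 1);
}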
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
.subsections_via_symbols
.text
.p2align 4
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
.p2align 4
.globl _AccelerateCrypto_SHA1_compress
_AccelerateCrypto_SHA1_compress:
#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3
BRANCH_TARGET_CALL
#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
// early exit if input number of blocks is zero
adrp ktable, K_XMM_AR@page
cbnz numblocks, 1f
ret lr
1:
add ktable, ktable, K_XMM_AR@pageoff // K table
#if BUILDKERNEL
// save vector registers that will be used in the computation: v0-v7, v16-v24
sub x4, sp, #17*16
sub sp, sp, #17*16
st1.4s {v0,v1,v2,v3}, [x4], #64
st1.4s {v4,v5,v6,v7}, [x4], #64
st1.4s {v16,v17,v18,v19}, [x4], #64
st1.4s {v20,v21,v22,v23}, [x4], #64
st1.4s {v24}, [x4], #16
#endif
ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
ld1.4s {v21,v22,v23,v24}, [ktable], #64 // k1,k2,k3,k4
ldr q16, [hashes], #16
ldr s17, [hashes], #-16
rev32.16b v0, v0 // byte swap of 1st 4 ints
rev32.16b v1, v1 // byte swap of 2nd 4 ints
rev32.16b v2, v2 // byte swap of 3rd 4 ints
rev32.16b v3, v3 // byte swap of 4th 4 ints
mov.16b v18, v16
add.4s v4, v0, v21 // 1st 4 input + K256
add.4s v5, v1, v21 // 2nd 4 input + K256
mov.16b v19, v17
add.4s v6, v2, v21 // 3rd 4 input + K256
add.4s v7, v3, v21 // 4th 4 input + K256
.macro sha1c_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1C 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm
.macro sha1p_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1P 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm
.macro sha1m_round
SHA1SU0 $0, $1, $2
mov.16b v20, v18
SHA1M 18, 19, $4
SHA1H 19, 20
SHA1SU1 $0, $3
add.4s $6, $5, $7
.endm
// 4 vector hashes update and load next vector rounds
.macro sha1p_hash_load_round
rev32.16b $1, $1
mov.16b v20, v18
SHA1P 18, 19, $0
SHA1H 19, 20
add.4s $2, $1, $3
.endm
.macro sha1p_hash_round
mov.16b v20, v18
SHA1P 18, 19, $0
SHA1H 19, 20
.endm
sha1c_round 0, 1, 2, 3, 4, v0, v4, v21
sha1c_round 1, 2, 3, 0, 5, v1, v5, v22
sha1c_round 2, 3, 0, 1, 6, v2, v6, v22
sha1c_round 3, 0, 1, 2, 7, v3, v7, v22
sha1c_round 0, 1, 2, 3, 4, v0, v4, v22
sha1p_round 1, 2, 3, 0, 5, v1, v5, v22
sha1p_round 2, 3, 0, 1, 6, v2, v6, v23
sha1p_round 3, 0, 1, 2, 7, v3, v7, v23
sha1p_round 0, 1, 2, 3, 4, v0, v4, v23
sha1p_round 1, 2, 3, 0, 5, v1, v5, v23
sha1m_round 2, 3, 0, 1, 6, v2, v6, v23
sha1m_round 3, 0, 1, 2, 7, v3, v7, v24
sha1m_round 0, 1, 2, 3, 4, v0, v4, v24
sha1m_round 1, 2, 3, 0, 5, v1, v5, v24
sha1m_round 2, 3, 0, 1, 6, v2, v6, v24
sha1p_round 3, 0, 1, 2, 7, v3, v7, v24
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.le L_wrapup
L_loop:
ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
sha1p_hash_load_round 4, v0, v4, v21
sha1p_hash_load_round 5, v1, v5, v21
sha1p_hash_load_round 6, v2, v6, v21
sha1p_hash_load_round 7, v3, v7, v21
add.4s v18, v16, v18
add.4s v19, v17, v19
mov.16b v16, v18
mov.16b v17, v19
sha1c_round 0, 1, 2, 3, 4, v0, v4, v21
sha1c_round 1, 2, 3, 0, 5, v1, v5, v22
sha1c_round 2, 3, 0, 1, 6, v2, v6, v22
sha1c_round 3, 0, 1, 2, 7, v3, v7, v22
sha1c_round 0, 1, 2, 3, 4, v0, v4, v22
sha1p_round 1, 2, 3, 0, 5, v1, v5, v22
sha1p_round 2, 3, 0, 1, 6, v2, v6, v23
sha1p_round 3, 0, 1, 2, 7, v3, v7, v23
sha1p_round 0, 1, 2, 3, 4, v0, v4, v23
sha1p_round 1, 2, 3, 0, 5, v1, v5, v23
sha1m_round 2, 3, 0, 1, 6, v2, v6, v23
sha1m_round 3, 0, 1, 2, 7, v3, v7, v24
sha1m_round 0, 1, 2, 3, 4, v0, v4, v24
sha1m_round 1, 2, 3, 0, 5, v1, v5, v24
sha1m_round 2, 3, 0, 1, 6, v2, v6, v24
sha1p_round 3, 0, 1, 2, 7, v3, v7, v24
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.gt L_loop
L_wrapup:
sha1p_hash_round 4
sha1p_hash_round 5
sha1p_hash_round 6
sha1p_hash_round 7
add.4s v16, v16, v18
add.4s v17, v17, v19
str q16,[hashes], #16
str s17,[hashes]
#if BUILDKERNEL
// restore vector registers that were clobbered in the computation: v0-v7, v16-v24
ld1.4s {v0,v1,v2,v3}, [sp], #64
ld1.4s {v4,v5,v6,v7}, [sp], #64
ld1.4s {v16,v17,v18,v19}, [sp], #64
ld1.4s {v20,v21,v22,v23}, [sp], #64
ld1.4s {v24}, [sp], #16
#endif
ret lr
#endif // define(__arm64__)

View File

@ -0,0 +1,30 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
#if (defined(__x86_64__) || defined(__i386__))
extern void AccelerateCrypto_SHA1_compress_ssse3(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_ssse3");
extern void AccelerateCrypto_SHA1_compress_AVX1(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_AVX1");
extern void AccelerateCrypto_SHA1_compress_AVX2(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA1_compress_AVX2");
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t num, const void *buf)
{
#if defined(__x86_64__)
if (HAS_AVX2()) AccelerateCrypto_SHA1_compress_AVX2(state, num, buf);
else if (HAS_AVX1()) AccelerateCrypto_SHA1_compress_AVX1(state, num, buf);
else
#endif
AccelerateCrypto_SHA1_compress_ssse3(state, num, buf);
}
#endif // (defined(__x86_64__) || defined(__i386__))
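A hypothetical caller of the dispatcher above, compressing a single 64-byte block starting from the standard SHA-1 initial state (sketch only):
#include <stddef.h>
#include <stdint.h>
void AccelerateCrypto_SHA1_compress(uint32_t *state, size_t num, const void *buf);
static void sha1_compress_one_block_example(const uint8_t block[64])
{
    uint32_t state[5] = { 0x67452301u, 0xefcdab89u, 0x98badcfeu,
                          0x10325476u, 0xc3d2e1f0u };
    AccelerateCrypto_SHA1_compress(state, 1, block);   /* num counts 64-byte blocks */
    (void)state;   /* state[] now holds the chaining value after this block */
}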

View File

@ -0,0 +1,785 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
#if defined(__x86_64__)
/* vng_sha1LittleEndian.s : this file provides optimized x86_64 avx1 implementation of the sha1 function
CoreOS - vector and numerics group
The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
Update HASH[] by processing a one 64-byte block in MESSAGE[] can be represented by the following C function
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0];
B[0] = HASH[1];
C[0] = HASH[2];
D[0] = HASH[3];
E[0] = HASH[4];
for ( i=0; i<80; ++i )
{
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80];
HASH[1] += B[80];
HASH[2] += C[80];
HASH[3] += D[80];
HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. done on 4 consecutive W[i] values in a single XMM register
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
*/
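For rounds 32 and up, the identity noted above ((X rol 1) rol 1 == X rol 2) collapses the update into a single rotate-by-2 over taps that are at least one full vector old; a scalar sketch (hypothetical helper, not part of this file):
#include <stdint.h>
static inline uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
/* One 4-lane W update for rounds 32..79 (i a multiple of 4, 32 <= i <= 76). */
static void w_update_intel(uint32_t W[80], int i)
{
    for (int j = 0; j < 4; j++)
        W[i + j] = rol32(W[i + j - 6] ^ W[i + j - 16] ^ W[i + j - 28] ^ W[i + j - 32], 2);
}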
/* the code can be compiled into single block (64 bytes) per call mode by setting Multiple_blocks to 0 */
#define Multiple_Blocks 1
#if defined (__x86_64__)
#if BUILDKERNEL
#define stack_size (32*10+16*4+16) // ymm0-9 + 4 128-bits for intermediate WK(t) storage + 32-byte alignment
#else
#define stack_size (16*4) // 4 128-bits for intermediate WK(t) storage
#endif
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with avx1 support
#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP REV32(%rip)
#define xmov vmovaps // aligned 16-byte move
#define xmovu vmovups // unaligned 16-byte move
// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#define D %r15d
#define E %edx
// temp variables
#define T1 %eax
#define T2 %ebx
#define WK(t) ((t)&15)*4(sp)
// int F1(int B, int C, int D) { return D ^ (B & (C ^ D)); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
// for i=60:79, F4 is identical to F2
#define F4 F2
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
for (i=0;i<16;i+=4) {
1. W_TMP = new 16 bytes from MESSAGE[]
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
3. WTMP += {K,K,K,K};
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}
each step is represented in one of the following 4 macro definitions
*/
.macro W_PRECALC_00_15_0 arg0 // input argument $0 : 0/4/8/12
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
.endm
.macro W_PRECALC_00_15_1 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
vpshufb XMM_SHUFB_BSWAP, W_TMP, \arg0 // convert W_TMP from little-endian into big-endian
.endm
.macro W_PRECALC_00_15_2 arg0 // K_BASE points to the current K quadruple.
vpaddd (K_BASE), \arg0, W_TMP // W_TMP += {K,K,K,K};
.endm
.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
.endm
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1, if we are taking from the initial W before rol 1, we should rol this by 2
The operation (updating W and W+K) is scheduled as and divided into 4 steps
0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W _TMP+K;
*/
.macro W_PRECALC_16_31_0 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
vpalignr $8, \arg0, \arg1, \arg4 // W = W14
vpsrldq $4, \arg3, W_TMP // W_TMP = W3
vpxor \arg2, \arg4, \arg4 // W = W8 ^ W14
.endm
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, W_TMP, W_TMP // W_TMP = W3 ^ W16
vpxor W_TMP, \arg1, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
vpslldq $12, \arg1, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm
.macro W_PRECALC_16_31_2 arg0 // input argument : W
vpslld $1, \arg0, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
vpsrld $31, \arg0, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
vpor \arg0, W_TMP, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
vpslld $2, W_TMP2, \arg0 // W = W[i] higher 30 bits after rol 2
vpsrld $30, W_TMP2, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
.endm
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
vpxor W_TMP, \arg0, \arg0
vpxor W_TMP2, \arg0, \arg0 // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
vpaddd \arg2(K_BASE), \arg0, W_TMP // W+K
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
.endm
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_Tmp; WK = W_tmp + K;
*/
.macro W_PRECALC_32_79_0 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
vpxor \arg0, \arg3, \arg3 // W = W28 ^ W32;
vpalignr $8, \arg1, \arg2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, \arg1, \arg1 // W_tmp = W6 ^ W16
vpxor W_TMP, \arg1, \arg1 // W_tmp = W6 ^ W16 ^ W28 ^ W32
//xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm
.macro W_PRECALC_32_79_2 arg0 // input argument : W
vpslld $2, \arg0, W_TMP // W << 2
vpsrld $30, \arg0, \arg0 // W >> 30
vpor W_TMP, \arg0, \arg0 // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
vpaddd \arg2(K_BASE), \arg0, W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
.endm
/* The hash update operation is completed by the following statements.
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
A1 = FN + E0 + rol(A0,5) + WK;
B1 = A0;
C1 = rol(B0, 30);
D1 = C0;
E1 = D0;
to avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.
Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C
These operations are distributed into the following 2 macro definitions RR0 and RR1.
*/
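A scalar sketch of what one RR0/RR1 pair computes (illustrative only; f stands for whichever of F1..F4 applies to the round, and wk[] holds the two precomputed W+K values read from the stack):
#include <stdint.h>
static inline uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
static void two_rounds(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
                       uint32_t (*f)(uint32_t, uint32_t, uint32_t),
                       const uint32_t wk[2])
{
    *e += f(*b, *c, *d) + rol32(*a, 5) + wk[0];   /* round t:   new value lands in E */
    *b  = rol32(*b, 30);
    /* the hashes are now logically ordered E,A,B,C,D */
    *d += f(*a, *b, *c) + rol32(*e, 5) + wk[1];   /* round t+1: new value lands in D */
    *a  = rol32(*a, 30);
    /* the hashes are now logically ordered D,E,A,B,C */
}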
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
rol $30, \arg2 // B = rol(B,30)
add WK(\arg6), \arg5 // E + WK(i)
mov \arg1, T2 // T2 = A
add WK(\arg6+1), \arg4 // D + WK(i+1)
rol $5, T2 // rol(A,5)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add \arg5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
mov T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rol $5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm
.macro INITIAL_W_PRECALC // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
.endm
.macro INTERNAL // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18
// starting using F2
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
W_PRECALC_32_79_2 W28
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
W_PRECALC_32_79_2 W24
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38
// starting using F3
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
W_PRECALC_32_79_2 W28
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
W_PRECALC_32_79_2 W24
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58
// starting using F4
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62
.endm
.macro SOFTWARE_PIPELINING
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0]
RR1 F4,A,B,C,D,E,70
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
.macro ENDING // finish up updating hash digests (i=64:79)
//i=80
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66
//i=84
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70
//i=88
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74
//i=92
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
.endm
.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
.macro UPDATE_ALL_HASH
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
.endm
/*
main sha1 code for system with avx1 support
*/
.macro SHA1_PIPELINED_MAIN_BODY
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subq _IMM(1), cnt // pre-decrement cnt by 1
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm
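/*
Added illustration (not part of the original source): the control flow that
SHA1_PIPELINED_MAIN_BODY expands to, written as a rough C sketch. The helper
names below are hypothetical stand-ins for the assembler macros of the same
names; only the loop structure is the point here.

    #include <stdint.h>
    #include <stddef.h>

    extern void load_hash(uint32_t state[5]);
    extern void initial_w_precalc(const uint8_t *block);
    extern void internal(void);
    extern void software_pipelining(const uint8_t *next_block);
    extern void ending(void);
    extern void update_all_hash(uint32_t state[5]);

    void sha1_blocks(uint32_t state[5], const uint8_t *buf, size_t nblocks) // nblocks >= 1
    {
        load_hash(state);
        initial_w_precalc(buf);              // W and W+K of the first block (i=0:15)
        for (;;) {
            internal();                      // schedule W (i=16:79), rounds 0..63
            buf += 64;
            if (--nblocks == 0) break;       // the "jbe 1f" exit
            software_pipelining(buf);        // rounds 64..79 of this block, interleaved
                                             // with the W/W+K precalc of the next block
            update_all_hash(state);
        }
        ending();                            // rounds 64..79 of the last block
        update_all_hash(state);
    }
*/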
/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembler code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_AVX1
_AccelerateCrypto_SHA1_compress_AVX1:
// start the sha1 code with avx1 support
// save callee-save registers
push %rbp
mov %rsp, %rbp
push %rbx
push %r15
sub $stack_size, sp // allocate stack memory for use
// save used xmm register if this is for kernel
#if BUILDKERNEL
andq $-32, sp // aligned sp to 32-bytes
leaq 4*16(sp), %rax
xmov %ymm0, 0*32(%rax)
xmov %ymm1, 1*32(%rax)
xmov %ymm2, 2*32(%rax)
xmov %ymm3, 3*32(%rax)
xmov %ymm4, 4*32(%rax)
xmov %ymm5, 5*32(%rax)
xmov %ymm6, 6*32(%rax)
xmov %ymm7, 7*32(%rax)
xmov %ymm8, 8*32(%rax)
xmov %ymm9, 9*32(%rax)
#endif
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE
SHA1_PIPELINED_MAIN_BODY
// restore used xmm registers if this is for kernel
#if BUILDKERNEL
leaq 4*16(sp), %rax
xmov 0*32(%rax), %ymm0
xmov 1*32(%rax), %ymm1
xmov 2*32(%rax), %ymm2
xmov 3*32(%rax), %ymm3
xmov 4*32(%rax), %ymm4
xmov 5*32(%rax), %ymm5
xmov 6*32(%rax), %ymm6
xmov 7*32(%rax), %ymm7
xmov 8*32(%rax), %ymm8
xmov 9*32(%rax), %ymm9
#endif
leaq -16(%rbp), %rsp
// restore callee-save registers
pop %r15
pop %rbx
pop %rbp
ret // return
CC_ASM_SECTION_CONST
.p2align 4, 0x90
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
REV32:
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // architecture x86_64
#endif // defined(__x86_64__)

View File

@ -0,0 +1,780 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
#if defined(__x86_64__)
/* vng_sha1LittleEndian.s : this file provides an optimized x86_64 avx2 implementation of the sha1 function
CoreOS - vector and numerics group
The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0];
B[0] = HASH[1];
C[0] = HASH[2];
D[0] = HASH[3];
E[0] = HASH[4];
for ( i=0; i<80; ++i )
{
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80];
HASH[1] += B[80];
HASH[2] += C[80];
HASH[3] += D[80];
HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. done on 4 consecutive W[i] values in a single XMM register
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
*/
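/*
Added illustration (not from the original source): a plain scalar C rendering of
the two-step Dean Gaudet update described above, assuming 32-bit unsigned words
and a hypothetical rol32() helper. Because rotation distributes over XOR, the
W[i] term missing from lane 3 can be folded in after the other lanes are done.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

    // compute W[i..i+3] for 16 <= i < 32, four lanes at a time
    static void w_update_16_31(uint32_t W[80], int i)
    {
        uint32_t lane0 = rol32(W[i-3] ^ W[i-8]  ^ W[i-14] ^ W[i-16], 1);
        uint32_t lane1 = rol32(W[i-2] ^ W[i-7]  ^ W[i-13] ^ W[i-15], 1);
        uint32_t lane2 = rol32(W[i-1] ^ W[i-6]  ^ W[i-12] ^ W[i-14], 1);
        uint32_t lane3 = rol32(          W[i-5] ^ W[i-11] ^ W[i-13], 1); // W[i] not known yet
        lane3 ^= rol32(lane0, 1);      // patch in the freshly computed W[i], rotated once more
        W[i] = lane0; W[i+1] = lane1; W[i+2] = lane2; W[i+3] = lane3;
    }
*/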
/* the code can be compiled into single-block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define Multiple_Blocks 1
#if BUILDKERNEL
#define stack_size (32*10+16*4+16) // ymm0-9 + 4 128-bits for intermediate WK(t) storage + 32-byte alignment
#else
#define stack_size (16*4) // 4 128-bits for intermediate WK(t) storage
#endif
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with avx2 support
#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP REV32(%rip)
#define xmov vmovaps // aligned 16-byte move
#define xmovu vmovups // unaligned 16-byte move
// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#define D %r15d
#define E %edx
// temp variables
#define T1 %eax
#define T2 %ebx
#define WK(t) ((t)&15)*4(sp)
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
// for i=60:79, F4 is identical to F2
#define F4 F2
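/*
Added note (not in the original): the F1 form used above, D ^ (B & (C ^ D)), is
the usual one-temporary rewrite of the textbook choose function (B & C) | (~B & D).
A small C self-check of the identity, using all-zero/all-one masks so every
per-bit combination is covered:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t ch_ref(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (~b & d); }
    static uint32_t f1_opt(uint32_t b, uint32_t c, uint32_t d) { return d ^ (b & (c ^ d)); }

    static void f1_selftest(void)
    {
        const uint32_t m[2] = { 0x00000000u, 0xffffffffu };
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 2; k++)
                    assert(ch_ref(m[i], m[j], m[k]) == f1_opt(m[i], m[j], m[k]));
    }
*/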
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
for (i=0;i<16;i+=4) {
1. W_TMP = new 16 bytes from MESSAGE[]
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
3. WTMP += {K,K,K,K};
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}
each step is represented in one of the following 4 macro definitions
*/
.macro W_PRECALC_00_15_0 arg0 // input argument $0 : 0/4/8/12
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
.endm
.macro W_PRECALC_00_15_1 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
vpshufb XMM_SHUFB_BSWAP, W_TMP, \arg0 // convert W_TMP from little-endian into big-endian
.endm
.macro W_PRECALC_00_15_2 arg0 // K_BASE points to the current K quadruple.
vpaddd (K_BASE), \arg0, W_TMP // W_TMP += {K,K,K,K};
.endm
.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
.endm
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1; if we are taking from the initial W before rol 1, we should rol this by 2
The operation (updating W and W+K) is scheduled as and divided into 4 steps
0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP+K;
*/
.macro W_PRECALC_16_31_0 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
vpalignr $8, \arg0, \arg1, \arg4 // W = W14
vpsrldq $4, \arg3, W_TMP // W_TMP = W3
vpxor \arg2, \arg4, \arg4 // W = W8 ^ W14
.endm
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, W_TMP, W_TMP // W_TMP = W3 ^ W16
vpxor W_TMP, \arg1, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
vpslldq $12, \arg1, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm
.macro W_PRECALC_16_31_2 arg0 // input argument : W
vpslld $1, \arg0, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
vpsrld $31, \arg0, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
vpor \arg0, W_TMP, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
vpslld $2, W_TMP2, \arg0 // W = W[i] higher 30 bits after rol 2
vpsrld $30, W_TMP2, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
.endm
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
vpxor W_TMP, \arg0, \arg0
vpxor W_TMP2, \arg0, \arg0 // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
vpaddd \arg2(K_BASE), \arg0, W_TMP // W+K
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
.endm
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_Tmp; WK = W_tmp + K;
*/
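/*
Added note (not in the original): how the rol 2 form above follows from the basic
recurrence. For i >= 32, expand each term of
    W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
one more step:
    W[i-3]  = (W[i-6]  ^ W[i-11] ^ W[i-17] ^ W[i-19]) rol 1
    W[i-8]  = (W[i-11] ^ W[i-16] ^ W[i-22] ^ W[i-24]) rol 1
    W[i-14] = (W[i-17] ^ W[i-22] ^ W[i-28] ^ W[i-30]) rol 1
    W[i-16] = (W[i-19] ^ W[i-24] ^ W[i-30] ^ W[i-32]) rol 1
XOR-ing the four right-hand sides cancels every index that appears twice
(i-11, i-17, i-19, i-22, i-24, i-30), and with (X rol 1) rol 1 = X rol 2 this leaves
    W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2
which is exactly the W6 ^ W16 ^ W28 ^ W32 form computed by the macros below.
*/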
.macro W_PRECALC_32_79_0 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
vpxor \arg0, \arg3, \arg3 // W = W28 ^ W32;
vpalignr $8, \arg1, \arg2, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
vpxor \arg0, \arg1, \arg1 // W_tmp = W6 ^ W16
vpxor W_TMP, \arg1, \arg1 // W_tmp = W6 ^ W16 ^ W28 ^ W32
//xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm
.macro W_PRECALC_32_79_2 arg0 // input argument : W
vpslld $2, \arg0, W_TMP // W << 2
vpsrld $30, \arg0, \arg0 // W >> 30
vpor W_TMP, \arg0, \arg0 // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
vpaddd \arg2(K_BASE), \arg0, W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
.endm
/* The hash update operation is completed by the following statements.
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
A1 = FN + E0 + rol(A0,5) + WK;
B1 = A0;
C1 = rol(B0, 30);
D1 = C0;
E1 = D0;
to avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.
Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C
These operations are distributed into the following 2 macro definitions RR0 and RR1.
*/
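/*
Added illustration (not part of the original): the pair of rounds performed by one
RR0/RR1 invocation, written as scalar C. f() stands for whichever of F1..F4 is in
use and rol32() for a 32-bit left rotate; both are assumed helpers.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
    extern uint32_t f(uint32_t b, uint32_t c, uint32_t d);   // one of F1..F4

    static void two_rounds(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                           uint32_t *e, uint32_t wk_i, uint32_t wk_i1)
    {
        // round i: the new "A" is accumulated in the E slot, B is rotated in place
        *e += f(*b, *c, *d) + rol32(*a, 5) + wk_i;
        *b  = rol32(*b, 30);
        // round i+1: the same formula, reading the slots in the order E,A,B,C,D
        *d += f(*a, *b, *c) + rol32(*e, 5) + wk_i1;
        *a  = rol32(*a, 30);
        // the live order is now D,E,A,B,C, as noted in the comment above
    }
*/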
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
rol $30, \arg2 // B = rol(B,30)
add WK(\arg6), \arg5 // E + WK(i)
rorx $27, \arg1, T2 // rol(A,5)
add WK(\arg6+1), \arg4 // D + WK(i+1)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rorx $27, \arg5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm
.macro INITIAL_W_PRECALC // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
.endm
.macro INTERNAL // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18
// starting using F2
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
W_PRECALC_32_79_2 W28
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
W_PRECALC_32_79_2 W24
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38
// starting using F3
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_32_79_0 W24,W4,W0,W28
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
W_PRECALC_32_79_2 W28
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_32_79_0 W20,W0,W28,W24
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
W_PRECALC_32_79_2 W24
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58
// starting using F4
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62
.endm
.macro SOFTWARE_PIPELINING
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 W0 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 W28 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0]
RR1 F4,A,B,C,D,E,70
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 W24 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 W20 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
.macro ENDING // finish up updating hash digests (i=64:79)
//i=80
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66
//i=84
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70
//i=88
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74
//i=92
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
.endm
.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
.macro UPDATE_ALL_HASH
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
.endm
/*
main sha1 code for system with avx2 support
*/
.macro SHA1_PIPELINED_MAIN_BODY
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subq _IMM(1), cnt // pre-decrement cnt by 1
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm
/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembler code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_AVX2
_AccelerateCrypto_SHA1_compress_AVX2:
// start the sha1 code with avx2 support
// save callee-save registers
push %rbp
mov %rsp, %rbp
push %rbx
push %r15
sub $stack_size, sp // allocate stack memory for use
// save used xmm register if this is for kernel
#if BUILDKERNEL
andq $-32, sp // aligned sp to 32-bytes
leaq 4*16(sp), %rax
xmov %ymm0, 0*32(%rax)
xmov %ymm1, 1*32(%rax)
xmov %ymm2, 2*32(%rax)
xmov %ymm3, 3*32(%rax)
xmov %ymm4, 4*32(%rax)
xmov %ymm5, 5*32(%rax)
xmov %ymm6, 6*32(%rax)
xmov %ymm7, 7*32(%rax)
xmov %ymm8, 8*32(%rax)
xmov %ymm9, 9*32(%rax)
#endif
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE
SHA1_PIPELINED_MAIN_BODY
// restore used xmm registers if this is for kernel
#if BUILDKERNEL
leaq 4*16(sp), %rax
xmov 0*32(%rax), %ymm0
xmov 1*32(%rax), %ymm1
xmov 2*32(%rax), %ymm2
xmov 3*32(%rax), %ymm3
xmov 4*32(%rax), %ymm4
xmov 5*32(%rax), %ymm5
xmov 6*32(%rax), %ymm6
xmov 7*32(%rax), %ymm7
xmov 8*32(%rax), %ymm8
xmov 9*32(%rax), %ymm9
#endif
leaq -16(%rbp), %rsp
// restore callee-save registers
pop %r15
pop %rbx
pop %rbp
ret // return
CC_ASM_SECTION_CONST
.p2align 4, 0x90
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
REV32:
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // defined(__x86_64__)

View File

@ -0,0 +1,983 @@
# Copyright (c) (2010,2011,2012,2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
#if (defined(__x86_64__) || defined(__i386__))
/* vng_sha1LittleEndian.s : this file provides optimized x86_64 and i386 implementations of the sha1 function
CoreOS - vector and numerics group
The implementation is based on the principle described in an Intel online article
"Improving the Performance of the Secure Hash Algorithm (SHA-1)"
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
Updating HASH[] by processing one 64-byte block in MESSAGE[] can be represented by the following C function
void SHA1( int HASH[], int MESSAGE[] )
{
int A[81], B[81], C[81], D[81], E[81];
int W[80];
int i, FN;
A[0] = HASH[0];
B[0] = HASH[1];
C[0] = HASH[2];
D[0] = HASH[3];
E[0] = HASH[4];
for ( i=0; i<80; ++i )
{
if ( i < 16 )
W[i] = BIG_ENDIAN_LOAD( MESSAGE[i] );
else
W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
FN = F( i, B[i], C[i], D[i] );
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + W[i] + K(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
}
HASH[0] += A[80];
HASH[1] += B[80];
HASH[2] += C[80];
HASH[3] += D[80];
HASH[4] += E[80];
}
For i=0:15, W[i] is simply big-endian loading of MESSAGE[i]. For i=16:79, W[i] is updated according to W[i] = ROTATE_LEFT( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1 );
The approach (by Dean Gaudet) can be used to vectorize the computation of W[i] for i=16:79,
1. done on 4 consecutive W[i] values in a single XMM register
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
2. this additional calculation unfortunately requires many additional operations
W[i+3] ^= W[i] rol 1
3. once we have 4 W[i] values in XMM we can also add four K values with one instruction
W[i:i+3] += {K,K,K,K}
Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
The Dean Gaudet approach can be expressed as
1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
2. W[i+3] ^= W[i] rol 1
3. W0 += {K,K,K,K}
For i>=32, the Intel online article suggests that (using a basic identity (X rol 1) rol 1 = X rol 2) the update equation is equivalent to
1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
Note:
1. In total, we need 8 16-byte registers or memory for W0,W4,...,W28. W0 and W32 can be the same register or memory.
2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte)
i=0, W28,W24,...,W0
i=4, W24,W20,...,W28
i=8, W20,W16,...,W24
.
.
and so forth.
3. 2 ssse3 instructions are used in the Intel article, pshufb and palignr.
a. pshufb is used to simplify the BIG_ENDIAN_LOAD operation
b. palignr is used to simplify the computation of left_shift(concatenate(W12,W8),64)
*/
/* the code can be compiled into single-block (64 bytes) per call mode by setting Multiple_Blocks to 0 */
#define Multiple_Blocks 1
#if defined (__x86_64__) || defined(__i386__) // x86_64 or i386 architectures
#if defined(__x86_64__)
// set up for x86_64
#define stack_size (16*11+16*4) // xmm0-xmm10 + 4 128-bits for intermediate WK(t) storage
#define sp %rsp // unifying architectural stack pointer representation
#define ctx %rdi // 1st input argument, will move to HASH_PTR (%r9)
#define buf %rdx // 3rd input argument, will move to BUFFER_PTR (%r10)
#define cnt %r11 // will copy from the 2nd input argument (%rsi)
#define K_BASE %r8 // an aligned pointer to point to shufb reference numbers of table of K values
#define HASH_PTR %r9 // pointer to Hash values (A,B,C,D,E)
#define BUFFER_PTR %r10 // pointer to input blocks
#else // !__x86_64__
// set up for i386
#define stack_size (12+16*2+16*11+16*4) // 12-bytes (alignment) + extra 2 + 3 (W24/W28/XMM_SHUFB_BSWAP) + 8 (xmm0-xmm7) + 4 (WK(t))
#define sp %esp // unifying architectural stack pointer representation
#define HASH_PTR stack_size+16+4(sp) // use 1st input argument from caller function, 16 for (esi/edi/ebx/ebp)
#define cnt stack_size+16+8(sp) // use 2nd input argument from caller function
#define BUFFER_PTR stack_size+16+12(sp) // use 3rd input argument from caller function
#define K_BASE stack_size-4(sp) // use for K_BASE
#endif // __x86_64__
// symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
#define W_TMP %xmm0
#define W_TMP2 %xmm1
#define W0 %xmm2
#define W4 %xmm3
#define W8 %xmm4
#define W12 %xmm5
#define W16 %xmm6
#define W20 %xmm7
#if defined(__x86_64__)
#define W24 %xmm8
#define W28 %xmm9
#define XMM_SHUFB_BSWAP %xmm10 // used only when ssse3 is supported
#else // defined (__i386__)
#define W24 12*16(sp)
#define W28 13*16(sp)
#define XMM_SHUFB_BSWAP 14*16(sp) // used only when ssse3 is supported
#endif
#define xmov movaps // aligned 16-byte move
#define xmovu movups // unaligned 16-byte move
// intermediate hash variables
#define A %ecx
#define B %esi
#define C %edi
#if defined(__x86_64__)
#define D %r15d
#else
#define D %ebp
#endif
#define E %edx
// temp variables
#define T1 %eax
#define T2 %ebx
#define WK(t) ((t)&15)*4(sp)
// int F1(int B, int C, int D) { return (D ^ (B & (C ^ D))); }
// result in T1
.macro F1 arg0, arg1, arg2
mov \arg1, T1
xor \arg2, T1
and \arg0, T1
xor \arg2, T1
.endm
// int F2(int B, int C, int D) { return (D ^ B ^ C); }
// result in T1
.macro F2 arg0, arg1, arg2
mov \arg2, T1
xor \arg1, T1
xor \arg0, T1
.endm
// int F3(int B, int C, int D) { return (B & C) | (D & (B ^ C)); }
// result in T1
.macro F3 arg0, arg1, arg2
mov \arg1, T1
mov \arg0, T2
or \arg0, T1
and \arg1, T2
and \arg2, T1
or T2, T1
.endm
// for i=60:79, F4 is identical to F2
#define F4 F2
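/*
Added note (not in the original): the F3 comment above writes the function as
(B & C) | (D & (B ^ C)) while the instruction sequence computes
(B & C) | (D & (B | C)); both are equivalent to the textbook majority
(B & C) | (B & D) | (C & D). A small C self-check over all-zero/all-one masks:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t maj_ref(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (b & d) | (c & d); }
    static uint32_t f3_xor(uint32_t b, uint32_t c, uint32_t d)  { return (b & c) | (d & (b ^ c)); }
    static uint32_t f3_or(uint32_t b, uint32_t c, uint32_t d)   { return (b & c) | (d & (b | c)); }

    static void f3_selftest(void)
    {
        const uint32_t m[2] = { 0x00000000u, 0xffffffffu };
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 2; k++) {
                    assert(maj_ref(m[i], m[j], m[k]) == f3_xor(m[i], m[j], m[k]));
                    assert(maj_ref(m[i], m[j], m[k]) == f3_or(m[i], m[j], m[k]));
                }
    }
*/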
/*
i=0:15, W[i] = BIG_ENDIAN_LOAD(MESSAGE[i]);
with ssse3 support, this is achieved via
for (i=0;i<16;i+=4) {
1. W_TMP = new 16 bytes from MESSAGE[]
2. W_TMP = pshufb(W_TMP, XMM_SHUFB_BSWAP); save to W circular buffer for updating W
3. WTMP += {K,K,K,K};
4. save quadruple W[i]+K[i] = W_TMP in the stack memory;
}
each step is represented in one of the following 4 macro definitions
*/
.macro W_PRECALC_00_15_0_ssse3 arg0 // input argument $0 : 0/4/8/12
#if defined (__x86_64__) // BUFFER_PTR is already an address register in x86_64
xmovu \arg0*4(BUFFER_PTR), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
#else // BUFFER_PTR is from the argument set up in the caller
mov BUFFER_PTR, T1 // T1 = BUFFER_PTR
xmovu \arg0*4(T1), W_TMP // read 16-bytes into W_TMP, BUFFER_PTR possibly not 16-byte aligned
#endif
.endm
.macro W_PRECALC_00_15_1_ssse3 arg0 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
pshufb XMM_SHUFB_BSWAP, W_TMP // convert W_TMP from little-endian into big-endian
xmov W_TMP, \arg0 // save W_TMP in the circular buffer
.endm
.macro W_PRECALC_00_15_2 // K_BASE points to the current K quadruple.
#if defined (__x86_64__) // K_BASE is already an address register in x86_64
paddd (K_BASE), W_TMP // W_TMP += {K,K,K,K};
#else // K_BASE is previously set up in the stack memory
mov K_BASE, T1 // T1 = K_BASE
paddd (T1), W_TMP // W_TMP += {K,K,K,K};
#endif
.endm
.macro W_PRECALC_00_15_3 arg0
xmov W_TMP, WK(\arg0&~3) // save quadruple W[i]+K in the stack memory, which would be used later for updating the hashes A/B/C/D/E
.endm
// rounds 16-31 compute W[0] using the vectorization approach by Dean Gaudet
/*
W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
W[i+3] ^= W[i] rol 1; // this W[i] is already rol by 1; if we are taking from the initial W before rol 1, we should rol this by 2
The operation (updating W and W+K) is scheduled as and divided into 4 steps
0. W_tmp = W3; W = W14 ^ W8
1. W = W3 ^ W8 ^ W14 ^ W16; W_TMP = W; W_TMP2 = (W[i] 0 0 0);
2. W_TMP = (W3 ^ W8 ^ W14 ^ W16) rol 1; split (W[i] 0 0 0) rol 2 in W_TMP2 and W
3. W = W_TMP = W_TMP ^ W_TMP2 ^ W = (W3 ^ W8 ^ W14 ^ W16) rol 1 ^ (W[i] 0 0 0) rol 2; WK = W_TMP+K;
*/
.macro W_PRECALC_16_31_0_ssse3 arg0, arg1, arg2, arg3, arg4 // input arguments : W16,W12,W8,W4,W
xmov \arg1, \arg4 // W = W12
palignr $8, \arg0, \arg4 // W = W14
xmov \arg3, W_TMP // W_TMP = W4
psrldq $4, W_TMP // W_TMP = W3
pxor \arg2, \arg4 // W = W8 ^ W14
.endm
.macro W_PRECALC_16_31_1 arg0, arg1 // input arguments : W16,W
pxor \arg0, W_TMP // W_TMP = W3 ^ W16
pxor W_TMP, \arg1 // W = W3 ^ W16 ^ W8 ^ W14
xmov \arg1, W_TMP2 // W_TMP2 = W3 ^ W16 ^ W8 ^ W14
xmov \arg1, W_TMP // W_TMP = W3 ^ W16 ^ W8 ^ W14
pslldq $12, W_TMP2 // W_TMP2 = (W[i] 0 0 0)
.endm
.macro W_PRECALC_16_31_2 arg0 // input argument : W
psrld $31, \arg0 // (W3 ^ W16 ^ W8 ^ W14)>>31
pslld $1, W_TMP // (W3 ^ W16 ^ W8 ^ W14)<<1
por \arg0, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1
xmov W_TMP2, \arg0 // copy W[i] at location of W[i+3]
psrld $30, W_TMP2 // W_TMP2 = W[i] lower 2 bits after rol 2
pslld $2, \arg0 // W = W[i] higher 30 bits after rol 2
.endm
.macro W_PRECALC_16_31_3 arg0, arg1, arg2 // input arguments: W, i, K_XMM
#if defined (__i386__)
mov K_BASE, T1 // K_BASE is stored in the stack memory for i386
#endif
pxor \arg0, W_TMP
pxor W_TMP2, W_TMP // W_TMP = (W3 ^ W16 ^ W8 ^ W14) rol 1 ^ (W[i] 0 0 0) rol 2
xmov W_TMP, \arg0 // save W = W_TMP in the W circular buffer
#if defined (__x86_64__)
paddd \arg2(K_BASE), W_TMP // W+K
#else
paddd \arg2(T1), W_TMP // W+K
#endif
xmov W_TMP, WK(\arg1&~3) // save WK = W+K for later update of the hashes A/B/C/D/E
.endm
/* rounds 32-79 compute W and W+K using the vectorization approach from the Intel article
W = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
where left_shift(concatenate(W8,W4),64) is equivalent to W6. Note also that W32 and W use the same register.
0. W_tmp = W6; W = W28 ^ W32;
1. W = W_tmp = W6 ^ W16 ^ W28 ^ W32;
2. W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2;
3. W = W_Tmp; WK = W_tmp + K;
*/
.macro W_PRECALC_32_79_0_ssse3 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
xmov \arg2, W_TMP // (w1 w2 w3 w4)
pxor \arg0, \arg3 // W = W28 ^ W32;
palignr $8, \arg1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
// this is a variant of W_PRECALC_32_79_0_ssse3 for i386 (as W24/W28 are stored in memory, not in registers)
.macro W_PRECALC_32_79_0_i386_ssse3 arg0, arg1, arg2, arg3 // input arguments : W28,W8,W4,W
xmov \arg3, W_TMP // W32
pxor \arg0, W_TMP // W28 ^ W32
xmov W_TMP, \arg3 // W = W28 ^ W32;
xmov \arg2, W_TMP // W4
palignr $8, \arg1, W_TMP // W_tmp = (w3 w4 w5 w6) = W6;
.endm
.macro W_PRECALC_32_79_1 arg0, arg1 // input arguments : W16,W
pxor \arg0, W_TMP // W_tmp = W6 ^ W16
pxor \arg1, W_TMP // W_tmp = W6 ^ W16 ^ W28 ^ W32
xmov W_TMP, \arg1 // W = W_tmp = W6 ^ W16 ^ W28 ^ W32
.endm
.macro W_PRECALC_32_79_2 arg0 // input argument : W
psrld $30, \arg0 // W >> 30
pslld $2, W_TMP // W << 2
por \arg0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
// this is a variant of W_PRECALC_32_79_2 for i386 (as W24/W28 are stored in memory, not in registers)
// this should be used when the input is either W24 or W28 on i386 architecture
.macro W_PRECALC_32_79_2_i386 arg0 // input argument : W
xmov \arg0, W_TMP2 // W
psrld $30, W_TMP2 // W >> 30
xmov W_TMP2, \arg0 // save (W >> 30) at W
pslld $2, W_TMP // W_tmp << 2
por \arg0, W_TMP // W_tmp = (W6 ^ W16 ^ W28 ^ W32) rol 2
.endm
.macro W_PRECALC_32_79_3 arg0, arg1, arg2 // input argument W, i, K_XMM
#if defined (__x86_64__)
xmov W_TMP, \arg0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
paddd \arg2(K_BASE), W_TMP // W + K
xmov W_TMP, WK(\arg1&~3) // write W+K
#else
mov K_BASE, T1 // T1 = K_BASE (which is in the caller argument)
xmov W_TMP, \arg0 // W = (W6 ^ W16 ^ W28 ^ W32) rol 2
paddd \arg2(T1), W_TMP // W_tmp = W + K
xmov W_TMP, WK(\arg1&~3) // write WK
#endif
.endm
/* The hash update operation is completed by the following statements.
A[i+1] = FN + E[i] + ROTATE_LEFT( A[i], 5 ) + WK(i);
B[i+1] = A[i];
C[i+1] = ROTATE_LEFT( B[i], 30 );
D[i+1] = C[i];
E[i+1] = D[i];
Suppose we start with A0,B0,C0,D0,E0. The 1st iteration can be expressed as follows:
A1 = FN + E0 + rol(A0,5) + WK;
B1 = A0;
C1 = rol(B0, 30);
D1 = C0;
E1 = D0;
to avoid excessive memory movement between registers,
1. A1 = FN + E0 + rol(A0,5) + WK; can be temporarily saved in E0,
2. C1 = rol(B0,30) can be temporarily saved in B0.
Therefore, ignoring the time index, the update operation is equivalent to
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
3. the hashes are now stored in the order of E,A,B,C,D
To pack 2 hash update operations in 1 iteration, starting with A,B,C,D,E
1. E = FN(B,C,D) + E + rol(A,5) + WK(i)
2. B = rol(B,30)
// now the hashes are in the order of E,A,B,C,D
3. D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
4. A = rol(A,30)
// now the hashes are in the order of D,E,A,B,C
These operations are distributed into the following 2 macro definitions RR0 and RR1.
*/
.macro RR0 arg0, arg1, arg2, arg3, arg4, arg5, arg6 // input arguments : FN, A, B, C, D, E, i
\arg0 \arg2, \arg3, \arg4 // T1 = FN(B,C,D)
add WK(\arg6), \arg5 // E + WK(i)
rol $30, \arg2 // B = rol(B,30)
mov \arg1, T2 // T2 = A
add WK(\arg6+1), \arg4 // D + WK(i+1)
rol $5, T2 // rol(A,5)
add T1, \arg5 // E = FN(B,C,D) + E + WK(i)
.endm
.macro RR1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
add \arg5, T2 // T2 = FN(B,C,D) + E + rol(A,5) + WK(i)
mov T2, \arg5 // E = FN(B,C,D) + E + rol(A,5) + WK(i)
rol $5, T2 // rol(E,5)
add T2, \arg4 // D + WK(i+1) + rol(E,5)
\arg0 \arg1, \arg2, \arg3 // FN(A,B,C)
add T1, \arg4 // D = FN(A,B,C) + D + rol(E,5) + WK(i+1)
rol $30, \arg1 // A = rol(A,30)
.endm
/*
The following macro definitions are used to expand code for the per-block sha1 operation.
INITIAL_W_PRECALC_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
INTERNAL_ssse3 : updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
ENDING : finishing up updating the digests A/B/C/D/E (i=64:79)
For multiple-block sha1 operation (Multiple_Blocks = 1), INITIAL_W_PRECALC_ssse3 and ENDING are combined
into 1 macro definition for software pipelining.
SOFTWARE_PIPELINING_ssse3 : BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack, and finishing up updating the digests A/B/C/D/E (i=64:79)
assume cnt (the number of blocks) >= 1, the main code body should look like
INITIAL_W_PRECALC_ssse3 // W = big_endian_load and pre-compute W+K (i=0:15)
do {
INTERNAL_ssse3 // update W(i=16:79), and update hash digests A/B/C/D/E (i=0:63)
cnt--;
if (cnt==0) break;
BUFFER_PTR += 64;
SOFTWARE_PIPELINING_ssse3; // update hash digests A/B/C/D/E (i=64:79) + W = big_endian_load and pre-compute W+K (i=0:15)
}
ENDING // update hash digests A/B/C/D/E (i=64:79)
*/
#define W_PRECALC_00_15_0 W_PRECALC_00_15_0_ssse3
#define W_PRECALC_00_15_1 W_PRECALC_00_15_1_ssse3
#define W_PRECALC_16_31_0 W_PRECALC_16_31_0_ssse3
#define W_PRECALC_32_79_0 W_PRECALC_32_79_0_ssse3
#define W_PRECALC_32_79_0_i386 W_PRECALC_32_79_0_i386_ssse3
.macro INITIAL_W_PRECALC_ssse3 // BIG_ENDIAN_LOAD(64 bytes block) into W (i=0:15) and store W+K into the stack memory
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W0 + K
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W28 + K
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W24 + K
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
W_PRECALC_00_15_2 // W_TMP = W20 + K
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
.endm
.macro INTERNAL_ssse3 // updating W (16:79) and update the digests A/B/C/D/E (i=0:63, based on W+K stored in the stack memory)
// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_16_31_0 W0,W28,W24,W20,W16
RR0 F1,A,B,C,D,E,0
W_PRECALC_16_31_1 W0,W16
RR1 F1,A,B,C,D,E,0
W_PRECALC_16_31_2 W16
RR0 F1,D,E,A,B,C,2
W_PRECALC_16_31_3 W16, 2, 0
RR1 F1,D,E,A,B,C,2
// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_16_31_0 W28,W24,W20,W16,W12
RR0 F1,B,C,D,E,A,4
W_PRECALC_16_31_1 W28,W12
RR1 F1,B,C,D,E,A,4
W_PRECALC_16_31_2 W12
RR0 F1,E,A,B,C,D,6
W_PRECALC_16_31_3 W12, 6, 16
RR1 F1,E,A,B,C,D,6
// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_16_31_0 W24,W20,W16,W12,W8
RR0 F1,C,D,E,A,B,8
W_PRECALC_16_31_1 W24,W8
RR1 F1,C,D,E,A,B,8
W_PRECALC_16_31_2 W8
RR0 F1,A,B,C,D,E,10
W_PRECALC_16_31_3 W8,10,16
RR1 F1,A,B,C,D,E,10
// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
W_PRECALC_16_31_0 W20,W16,W12,W8,W4
RR0 F1,D,E,A,B,C,12
W_PRECALC_16_31_1 W20,W4
RR1 F1,D,E,A,B,C,12
W_PRECALC_16_31_2 W4
RR0 F1,B,C,D,E,A,14
W_PRECALC_16_31_3 W4,14,16
RR1 F1,B,C,D,E,A,14
// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F1,E,A,B,C,D,16
W_PRECALC_32_79_1 W16,W0
RR1 F1,E,A,B,C,D,16
W_PRECALC_32_79_2 W0
RR0 F1,C,D,E,A,B,18
W_PRECALC_32_79_3 W0,18,16
RR1 F1,C,D,E,A,B,18
// starting using F2
// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
W_PRECALC_32_79_0 W24,W4,W0,W28
#else
W_PRECALC_32_79_0_i386 W24,W4,W0,W28
#endif
RR0 F2,A,B,C,D,E,20
W_PRECALC_32_79_1 W12,W28
RR1 F2,A,B,C,D,E,20
#if defined (__x86_64__)
W_PRECALC_32_79_2 W28
#else
W_PRECALC_32_79_2_i386 W28
#endif
RR0 F2,D,E,A,B,C,22
W_PRECALC_32_79_3 W28,22,16
RR1 F2,D,E,A,B,C,22
// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
#undef K_XMM
#define K_XMM 32
#if defined (__x86_64__)
W_PRECALC_32_79_0 W20,W0,W28,W24
#else
W_PRECALC_32_79_0_i386 W20,W0,W28,W24
#endif
RR0 F2,B,C,D,E,A,24
W_PRECALC_32_79_1 W8,W24
RR1 F2,B,C,D,E,A,24
#if defined (__x86_64__)
W_PRECALC_32_79_2 W24
#else
W_PRECALC_32_79_2_i386 W24
#endif
RR0 F2,E,A,B,C,D,26
W_PRECALC_32_79_3 W24,26,K_XMM
RR1 F2,E,A,B,C,D,26
// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F2,C,D,E,A,B,28
W_PRECALC_32_79_1 W4,W20
RR1 F2,C,D,E,A,B,28
W_PRECALC_32_79_2 W20
RR0 F2,A,B,C,D,E,30
W_PRECALC_32_79_3 W20,30,K_XMM
RR1 F2,A,B,C,D,E,30
// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
W_PRECALC_32_79_0 W12,W24,W20,W16
RR0 F2,D,E,A,B,C,32
W_PRECALC_32_79_1 W0,W16
RR1 F2,D,E,A,B,C,32
W_PRECALC_32_79_2 W16
RR0 F2,B,C,D,E,A,34
W_PRECALC_32_79_3 W16,34,K_XMM
RR1 F2,B,C,D,E,A,34
// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
W_PRECALC_32_79_0 W8,W20,W16,W12
RR0 F2,E,A,B,C,D,36
W_PRECALC_32_79_1 W28,W12
RR1 F2,E,A,B,C,D,36
W_PRECALC_32_79_2 W12
RR0 F2,C,D,E,A,B,38
W_PRECALC_32_79_3 W12,38,K_XMM
RR1 F2,C,D,E,A,B,38
// starting using F3
// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
W_PRECALC_32_79_0 W4,W16,W12,W8
RR0 F3,A,B,C,D,E,40
W_PRECALC_32_79_1 W24,W8
RR1 F3,A,B,C,D,E,40
W_PRECALC_32_79_2 W8
RR0 F3,D,E,A,B,C,42
W_PRECALC_32_79_3 W8,42,K_XMM
RR1 F3,D,E,A,B,C,42
// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
#undef K_XMM
#define K_XMM 48
W_PRECALC_32_79_0 W0,W12,W8,W4
RR0 F3,B,C,D,E,A,44
W_PRECALC_32_79_1 W20,W4
RR1 F3,B,C,D,E,A,44
W_PRECALC_32_79_2 W4
RR0 F3,E,A,B,C,D,46
W_PRECALC_32_79_3 W4,46,K_XMM
RR1 F3,E,A,B,C,D,46
// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_32_79_0 W28,W8,W4,W0
RR0 F3,C,D,E,A,B,48
W_PRECALC_32_79_1 W16,W0
RR1 F3,C,D,E,A,B,48
W_PRECALC_32_79_2 W0
RR0 F3,A,B,C,D,E,50
W_PRECALC_32_79_3 W0,50,K_XMM
RR1 F3,A,B,C,D,E,50
// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
#if defined (__x86_64__)
W_PRECALC_32_79_0 W24,W4,W0,W28
#else
W_PRECALC_32_79_0_i386 W24,W4,W0,W28
#endif
RR0 F3,D,E,A,B,C,52
W_PRECALC_32_79_1 W12,W28
RR1 F3,D,E,A,B,C,52
#if defined (__x86_64__)
W_PRECALC_32_79_2 W28
#else
W_PRECALC_32_79_2_i386 W28
#endif
RR0 F3,B,C,D,E,A,54
W_PRECALC_32_79_3 W28,54,K_XMM
RR1 F3,B,C,D,E,A,54
// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
#if defined (__x86_64__)
W_PRECALC_32_79_0 W20,W0,W28,W24
#else
W_PRECALC_32_79_0_i386 W20,W0,W28,W24
#endif
RR0 F3,E,A,B,C,D,56
W_PRECALC_32_79_1 W8,W24
RR1 F3,E,A,B,C,D,56
#if defined (__x86_64__)
W_PRECALC_32_79_2 W24
#else
W_PRECALC_32_79_2_i386 W24
#endif
RR0 F3,C,D,E,A,B,58
W_PRECALC_32_79_3 W24,58,K_XMM
RR1 F3,C,D,E,A,B,58
// starting using F4
// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_32_79_0 W16,W28,W24,W20
RR0 F4,A,B,C,D,E,60
W_PRECALC_32_79_1 W4,W20
RR1 F4,A,B,C,D,E,60
W_PRECALC_32_79_2 W20
RR0 F4,D,E,A,B,C,62
W_PRECALC_32_79_3 W20,62,K_XMM
RR1 F4,D,E,A,B,C,62
.endm
.macro SOFTWARE_PIPELINING_ssse3
// i=0 : W28,W24,W20,W16,W12,W8,W4,W0
W_PRECALC_00_15_0 0 // W_TMP = (BUFFER_PTR)
RR0 F4,B,C,D,E,A,64
W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
RR1 F4,B,C,D,E,A,64
W_PRECALC_00_15_2 // W_TMP = W0 + K
RR0 F4,E,A,B,C,D,66
W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
RR1 F4,E,A,B,C,D,66
// i=4 : W24,W20,W16,W12,W8,W4,W0,W28
W_PRECALC_00_15_0 4 // W_TMP = 16(BUFFER_PTR)
RR0 F4,C,D,E,A,B,68
W_PRECALC_00_15_1 W28 // convert W_TMP to big-endian, and save W28 = W_TMP
RR1 F4,C,D,E,A,B,68
W_PRECALC_00_15_2 // W_TMP = W28 + K
RR0 F4,A,B,C,D,E,70
W_PRECALC_00_15_3 7 // 16(sp) = W_TMP = W28 + K[0]
RR1 F4,A,B,C,D,E,70
// i=8 : W20,W16,W12,W8,W4,W0,W28,W24
W_PRECALC_00_15_0 8 // W_TMP = 32(BUFFER_PTR)
RR0 F4,D,E,A,B,C,72
W_PRECALC_00_15_1 W24 // convert W_TMP to big-endian, and save W24 = W_TMP
RR1 F4,D,E,A,B,C,72
W_PRECALC_00_15_2 // W_TMP = W24 + K
RR0 F4,B,C,D,E,A,74
W_PRECALC_00_15_3 11 // 32(sp) = W_TMP = W24 + K
RR1 F4,B,C,D,E,A,74
// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
W_PRECALC_00_15_0 12 // W_TMP = 48(BUFFER_PTR)
RR0 F4,E,A,B,C,D,76
W_PRECALC_00_15_1 W20 // convert W_TMP to big-endian, and save W20 = W_TMP
RR1 F4,E,A,B,C,D,76
W_PRECALC_00_15_2 // W_TMP = W20 + K
RR0 F4,C,D,E,A,B,78
W_PRECALC_00_15_3 15 // 48(sp) = W_TMP = W20 + K
RR1 F4,C,D,E,A,B,78
.endm
#undef W_PRECALC_00_15_0
#undef W_PRECALC_00_15_1
#undef W_PRECALC_16_31_0
#undef W_PRECALC_32_79_0
#undef W_PRECALC_32_79_0_i386
.macro ENDING // finish up updating hash digests (i=64:79)
//i=80
RR0 F4,B,C,D,E,A,64
RR1 F4,B,C,D,E,A,64
RR0 F4,E,A,B,C,D,66
RR1 F4,E,A,B,C,D,66
//i=84
RR0 F4,C,D,E,A,B,68
RR1 F4,C,D,E,A,B,68
RR0 F4,A,B,C,D,E,70
RR1 F4,A,B,C,D,E,70
//i=88
RR0 F4,D,E,A,B,C,72
RR1 F4,D,E,A,B,C,72
RR0 F4,B,C,D,E,A,74
RR1 F4,B,C,D,E,A,74
//i=92
RR0 F4,E,A,B,C,D,76
RR1 F4,E,A,B,C,D,76
RR0 F4,C,D,E,A,B,78
RR1 F4,C,D,E,A,B,78
.endm
// load hash digests A,B,C,D,E from memory into registers
.macro LOAD_HASH
#if defined (__x86_64__)
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
#else
mov HASH_PTR, T1
mov (T1), A
mov 4(T1), B
mov 8(T1), C
mov 12(T1), D
mov 16(T1), E
#endif
.endm
.macro UPDATE_HASH arg0, arg1
add \arg0, \arg1
mov \arg1, \arg0
.endm
.macro UPDATE_ALL_HASH
#if defined (__x86_64__)
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), B
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
#else
mov HASH_PTR, T1
UPDATE_HASH (T1), A
UPDATE_HASH 4(T1), B
UPDATE_HASH 8(T1), C
UPDATE_HASH 12(T1), D
UPDATE_HASH 16(T1), E
#endif
.endm
/*
main sha1 code for system with ssse3 support
*/
.macro SHA1_PIPELINED_MAIN_BODY_ssse3
LOAD_HASH // load initial hashes into A,B,C,D,E
INITIAL_W_PRECALC_ssse3 // big_endian_load(W) and W+K (i=0:15)
.p2align 4,0x90
0:
INTERNAL_ssse3 // update W (i=16:79) and update ABCDE (i=0:63)
#if Multiple_Blocks
#if defined (__x86_64__)
addq _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subq _IMM(1), cnt // pre-decrement cnt by 1
#else
addl _IMM(64), BUFFER_PTR // BUFFER_PTR+=64;
subl _IMM(1), cnt // pre-decrement cnt by 1
#endif
jbe 1f // if cnt <= 0, branch to finish off
SOFTWARE_PIPELINING_ssse3 // update ABCDE (i=64:79) || big_endian_load(W) and W+K (i=0:15)
UPDATE_ALL_HASH // update output hashes
jmp 0b // repeat for next block
.p2align 4,0x90
1:
#endif
ENDING // update ABCDE (i=64:79)
UPDATE_ALL_HASH // update output hashes
.endm
/*
I removed the cpu capabilities check. The check is now done
in C code and the appropriate version of the assembler code
is selected.
*/
.text
.globl _AccelerateCrypto_SHA1_compress_ssse3
_AccelerateCrypto_SHA1_compress_ssse3:
// start the sha1 code with ssse3 support
// save callee-save registers
#if defined (__x86_64__)
push %rbp
mov %rsp, %rbp
push %rbx
push %r15
#else
push %ebx
push %ebp
push %esi
push %edi
#endif
sub $stack_size, sp // allocate stack memory for use
// save used xmm register if this is for kernel
#if BUILDKERNEL
xmov %xmm0, 4*16(sp)
xmov %xmm1, 5*16(sp)
xmov %xmm2, 6*16(sp)
xmov %xmm3, 7*16(sp)
xmov %xmm4, 8*16(sp)
xmov %xmm5, 9*16(sp)
xmov %xmm6, 10*16(sp)
xmov %xmm7, 11*16(sp)
#if defined (__x86_64__)
xmov %xmm8, 12*16(sp)
xmov %xmm9, 13*16(sp)
xmov %xmm10, 14*16(sp)
#endif
#endif
#if defined (__x86_64__)
// set up registers to free %edx/%edi/%esi for other use (ABCDE)
mov ctx, HASH_PTR
mov buf, BUFFER_PTR
#if Multiple_Blocks
mov %rsi, cnt
#endif
lea K_XMM_AR(%rip), K_BASE
xmov 0x40(K_BASE), XMM_SHUFB_BSWAP
#else // __i386__
#if BUILDKERNEL
lea K_XMM_AR, %eax
#else
// Get address of 0 in R.
call 0f // Push program counter onto stack.
0: pop %eax // Get program counter.
lea K_XMM_AR-0b(%eax), %eax
#endif
mov %eax, K_BASE
xmov 0x40(%eax), %xmm0
xmov %xmm0, XMM_SHUFB_BSWAP
#endif
SHA1_PIPELINED_MAIN_BODY_ssse3
// restore used xmm registers if this is for kernel
#if BUILDKERNEL
xmov 4*16(sp), %xmm0
xmov 5*16(sp), %xmm1
xmov 6*16(sp), %xmm2
xmov 7*16(sp), %xmm3
xmov 8*16(sp), %xmm4
xmov 9*16(sp), %xmm5
xmov 10*16(sp), %xmm6
xmov 11*16(sp), %xmm7
#if defined (__x86_64__)
xmov 12*16(sp), %xmm8
xmov 13*16(sp), %xmm9
xmov 14*16(sp), %xmm10
#endif
#endif
add $stack_size, sp // deallocate stack memory
// restore callee-save registers
#if defined (__x86_64__)
pop %r15
pop %rbx
pop %rbp
#else
pop %edi
pop %esi
pop %ebp
pop %ebx
#endif
ret // return
CC_ASM_SECTION_CONST
.p2align 4, 0x90
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
K_XMM_AR:
.long K1
.long K1
.long K1
.long K1
.long K2
.long K2
.long K2
.long K2
.long K3
.long K3
.long K3
.long K3
.long K4
.long K4
.long K4
.long K4
// bswap_shufb_ctl: accessed thru 0x40(K_XMM_AR)
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // architecture x86_64 or i386
#endif // (defined(__x86_64__) || defined(__i386__))

View File

@ -0,0 +1,854 @@
# Copyright (c) (2011,2012,2013,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides armv7+neon hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or memory
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
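/*
Added illustration (not part of the original): one round of the per-block loop
above as scalar C, with the a..h permutation absorbed by rotating the argument
order in the caller, which is what the "rounds"/"rounds_a" macros below do by
renaming registers. The helpers mirror the #define comments further down.

    #include <stdint.h>

    static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
    static uint32_t ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
    static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
    static uint32_t Sig0(uint32_t x) { return rotr32(x, 2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
    static uint32_t Sig1(uint32_t x) { return rotr32(x, 6) ^ rotr32(x, 11) ^ rotr32(x, 25); }

    // one round; wk is the precomputed K[r] + W[r] read from the stack circular buffer
    static void round_c(uint32_t a, uint32_t b, uint32_t c, uint32_t *d,
                        uint32_t e, uint32_t f, uint32_t g, uint32_t *h, uint32_t wk)
    {
        uint32_t T1 = *h + Sig1(e) + ch(e, f, g) + wk;
        *d += T1;
        *h  = T1 + Sig0(a) + maj(a, b, c);
    }

    // four consecutive rounds r..r+3, rotating the roles each time:
    //   round_c(a,b,c,&d, e,f,g,&h, WK[r+0]);
    //   round_c(h,a,b,&c, d,e,f,&g, WK[r+1]);
    //   round_c(g,h,a,&b, c,d,e,&f, WK[r+2]);
    //   round_c(f,g,h,&a, b,c,d,&e, WK[r+3]);
*/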
#if (defined(__arm__) && defined(__ARM_NEON__))
// associate variables with registers or memory
#define ctx r0
#define data r1
#define num_blocks [sp, #64]
#define _i_loop [sp, #68]
#define a r2
#define b r3
#define c r4
#define d r5
#define e r8
#define f r9
#define g r10
#define h r11
#define K r6
// 2 local variables
#define t r12
#define s lr
// a window (16 words) of message schedule
#define W0 q0
#define W1 q1
#define W2 q2
#define W3 q3
#define zero q8
// circular buffer for WK[(r:r+15)%16]
#define WK(r) [sp,#((r)&15)*4]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mvn t, $0 // ~x
and s, $0, $1 // (x) & (y)
and t, t, $2 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor t, t, s // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma0
vshr.u32 q6, $0, #7
vshl.i32 q7, $0, #14
vshr.u32 $0, $0, #3
veor $0, q6
veor $0, q7
vshr.u32 q6, #11
vshl.i32 q7, #11
veor $0, q6
veor $0, q7
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma1
vshr.u32 q6, $0, #17
vshl.i32 q7, $0, #13
vshr.u32 $0, $0, #10
veor $0, q6
veor $0, q7
vshr.u32 q6, #2
vshl.i32 q7, #2
veor $0, q6
veor $0, q7
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
ror t, $0, #2 // S32(2, (x))
ror s, $0, #13 // S32(13, (x))
eor t, t, s // S32(2, (x)) ^ S32(13, (x))
ror s, s, #9 // S32(22, (x))
eor t, t, s // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
ror t, $0, #6 // S32(6, (x))
ror s, $0, #11 // S32(11, (x))
eor t, t, s // S32(6, (x)) ^ S32(11, (x))
ror s, s, #14 // S32(25, (x))
eor t, t, s // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
// ror t, $4, #6 // S32(6, (x))
eor t, t, $4, ror #11 // S32(6, (x)) ^ S32(11, (x))
and s, $4, $5 // (x) & (y)
eor t, t, $4, ror #25 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
add $7, t // use h to store h+Sigma1(e)
bic t, $6, $4 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK($8) //
add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
ror t, $0, #2 // S32(2, (x))
add $7, s // h = T1
eor t, t, $0, ror #13 // S32(2, (x)) ^ S32(13, (x))
add $3, $7 // d += T1;
eor t, t, $0, ror #22 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add $7, t // h = T1 + Sigma0(a);
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
// add $7, s // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
ror t, $4, #6
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
ror t, $3, #6
add $7, s
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
ror t, $2, #6
add $6, s
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
ror t, $1, #6
add $5, s
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
add $4, s
.endm
.macro rounds_a
ror t, e, #6
round a, b, c, d, e, f, g, h, 0+$0
ror t, d, #6
add h, s
round h, a, b, c, d, e, f, g, 1+$0
ror t, c, #6
add g, s
round g, h, a, b, c, d, e, f, 2+$0
ror t, b, #6
add f, s
round f, g, h, a, b, c, d, e, 3+$0
add e, s
.endm
.macro rounds_a_update_W_WK
ror t, e, #6
round a, b, c, d, e, f, g, h, 0+$0
vld1.s32 {$2},[data]!
ror t, d, #6
add h, s
round h, a, b, c, d, e, f, g, 1+$0
vrev32.8 $2, $2
ror t, c, #6
vld1.s32 {q4},[K,:128]!
add g, s
round g, h, a, b, c, d, e, f, 2+$0
ror t, b, #6
add f, s
vadd.s32 q4, $2
round f, g, h, a, b, c, d, e, 3+$0
add t, sp, #($1*16)
add e, s
vst1.32 {q4},[t]
.endm
.macro rounds_e
ror t, a, #6
round e, f, g, h, a, b, c, d, 0+$0
ror t, h, #6
add d, s
round d, e, f, g, h, a, b, c, 1+$0
ror t, g, #6
add c, s
round c, d, e, f, g, h, a, b, 2+$0
ror t, f, #6
add b, s
round b, c, d, e, f, g, h, a, 3+$0
add a, s
.endm
.macro rounds_e_update_W_WK
ror t, a, #6
round e, f, g, h, a, b, c, d, 0+$0
vld1.s32 {$2},[data]!
ror t, h, #6
add d, s
round d, e, f, g, h, a, b, c, 1+$0
vrev32.8 $2, $2
ror t, g, #6
vld1.s32 {q4},[K,:128]!
add c, s
round c, d, e, f, g, h, a, b, 2+$0
ror t, f, #6
add b, s
vadd.s32 q4, $2
round b, c, d, e, f, g, h, a, 3+$0
add t, sp, #($1*16)
add a, s
vst1.32 {q4},[t]
.endm
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
.macro message_schedule
vld1.32 {q5},[K,:128]!
vext.32 q4, $0, $1, #1 // Q4 = w4:w1
sigma0 q4 // sigma0(w4:w1)
vadd.s32 $0, q4 // w3:w0 + sigma0(w4:w1)
vext.32 q6, $2, $3, #1 // Q6 = w12:w9
vadd.s32 $0, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
vext.64 q4, $3, zero, #1 // 0 0 w15:w14
sigma1 q4 // Q4 = sigma1(0 0 w15:w14)
vadd.s32 $0, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
vext.64 q4, zero, $0, #1 // Q4 = (w17:w16 0 0)
sigma1 q4 // sigma1(w17:w16 0 0)
vadd.s32 $0, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
add t, sp, #(($4&15)*4)
vadd.s32 q5, $0 // W+K
vst1.32 {q5},[t,:128]
.endm
// this macro is used in the last 16 rounds of a current block
// it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3]
// and save into stack to prepare for next block
.macro update_W_WK
vld1.s32 {$1},[data]!
vrev32.8 $1, $1
add t, sp, #($0*16)
vld1.s32 {q4},[K,:128]!
vadd.s32 q4, $1
vst1.32 {q4},[t]
.endm
.macro Update_Digits
ldrd t, s, [ctx]
add a, t
add b, s
strd a, b, [ctx]
ldrd t, s, [ctx,#8]
add c, t
add d, s
strd c, d, [ctx, #8]
ldrd t, s, [ctx,#16]
add e, t
add f, s
strd e, f, [ctx, #16]
ldrd t, s, [ctx,#24]
add g, t
add h, s
strd g, h, [ctx, #24]
.endm
.macro rounds_a_schedule_update
eor t, e, e, ror #5 // S32(6, (x)) ^ S32(11, (x))
vld1.32 {q5},[K,:128]!
eor t, t, e, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
vext.32 q4, $1, $2, #1 // Q4 = w4:w1
and s, e, f // (x) & (y)
add h, t, ror #6 // use h to store h+Sigma1(e)
bic t, g, e // (~(x)) & (z)
vshr.u32 q6, q4, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
vshl.i32 q7, q4, #14
ldr s, WK($0) //
add h, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, a, a, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshr.u32 q4, q4, #3
add h, s // h = T1
eor t, t, a, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add d, h // d += T1;
add h, t, ror #2 // h = T1 + Sigma0(a);
veor q4, q6
eor t, b, c // y^z
vshr.u32 q6, #11
and s, b, c // y&z
and t, t, a // x&(y^z)
veor q4, q7
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, d, d, ror #5 // S32(6, (x)) ^ S32(11, (x))
vshl.i32 q7, #11
add h, s
veor q4, q6
eor t, t, d, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, d, e // (x) & (y)
veor q4, q7
add g, t, ror #6 // use h to store h+Sigma1(e)
bic t, f, d // (~(x)) & (z)
vext.32 q6, $3, $4, #1 // Q6 = w12:w9
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(1+$0) //
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1)
add g, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, h, h, ror #11 // S32(2, (x)) ^ S32(13, (x))
vadd.s32 $1, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
add g, s // h = T1
eor t, t, h, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
vext.64 q4, $4, zero, #1 // 0 0 w15:w14
add c, g // d += T1;
add g, t, ror #2 // h = T1 + Sigma0(a);
eor t, a, b // y^z
and s, a, b // y&z
vshr.u32 q6, q4, #17
and t, t, h // x&(y^z)
vshl.i32 q7, q4, #13
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
vshr.u32 q4, q4, #10
eor t, c, c, ror #5 // S32(6, (x)) ^ S32(11, (x))
veor q4, q6
add g, s
veor q4, q7
eor t, t, c, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
vshr.u32 q6, #2
and s, c, d // (x) & (y)
vshl.i32 q7, #2
add f, t, ror #6 // use h to store h+Sigma1(e)
veor q4, q6
bic t, e, c // (~(x)) & (z)
veor q4, q7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
ldr s, WK(2+$0) //
vext.64 q4, zero, $1, #1 // Q4 = (w17:w16 0 0)
add f, t // t = h+Sigma1(e)+Ch(e,f,g);
vshr.u32 q6, q4, #17
eor t, g, g, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshl.i32 q7, q4, #13
add f, s // h = T1
vshr.u32 q4, q4, #10
eor t, t, g, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
veor q4, q6
add b, f // d += T1;
veor q4, q7
add f, t, ror #2 // h = T1 + Sigma0(a);
eor t, h, a // y^z
vshr.u32 q6, #2
and s, h, a // y&z
and t, t, g // x&(y^z)
vshl.i32 q7, #2
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, b, b, ror #5 // S32(6, (x)) ^ S32(11, (x))
veor q4, q6
add f, s
eor t, t, b, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
veor q4, q7
vadd.s32 $1, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
and s, b, c // (x) & (y)
add e, t, ror #6 // use h to store h+Sigma1(e)
bic t, d, b // (~(x)) & (z)
vadd.s32 q5, $1 // W+K
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(3+$0) //
add e, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, f, f, ror #11 // S32(2, (x)) ^ S32(13, (x))
add e, s // h = T1
eor t, t, f, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add a, e // d += T1;
add e, t, ror #2 // h = T1 + Sigma0(a);
eor t, g, h // y^z
and s, g, h // y&z
and t, t, f // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add t, sp, #(($0&15)*4)
add e, s
vst1.32 {q5},[t,:128]
.endm
.macro rounds_e_schedule_update
eor t, a, a, ror #5 // S32(6, (x)) ^ S32(11, (x))
vld1.32 {q5},[K,:128]!
eor t, t, a, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
vext.32 q4, $1, $2, #1 // Q4 = w4:w1
and s, a, b // (x) & (y)
add d, t, ror #6 // use h to store h+Sigma1(e)
bic t, c, a // (~(x)) & (z)
vshr.u32 q6, q4, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
vshl.i32 q7, q4, #14
ldr s, WK($0) //
add d, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, e, e, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshr.u32 q4, q4, #3
add d, s // h = T1
eor t, t, e, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add h, d // d += T1;
veor q4, q6
add d, t, ror #2 // h = T1 + Sigma0(a);
vshr.u32 q6, #11
eor t, f, g // y^z
and s, f, g // y&z
veor q4, q7
and t, t, e // x&(y^z)
vshl.i32 q7, #11
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
veor q4, q6
eor t, h, h, ror #5 // S32(6, (x)) ^ S32(11, (x))
vext.32 q6, $3, $4, #1 // Q6 = w12:w9
add d, s
veor q4, q7
eor t, t, h, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, h, a // (x) & (y)
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1)
add c, t, ror #6 // use h to store h+Sigma1(e)
bic t, b, h // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(1+$0) //
add c, t // t = h+Sigma1(e)+Ch(e,f,g);
vadd.s32 $1, q6 // w3:w0 + sigma0(w4:w1) + w12:w9
eor t, d, d, ror #11 // S32(2, (x)) ^ S32(13, (x))
vext.64 q4, $4, zero, #1 // 0 0 w15:w14
add c, s // h = T1
eor t, t, d, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add g, c // d += T1;
vshr.u32 q6, q4, #17
add c, t, ror #2 // h = T1 + Sigma0(a);
vshl.i32 q7, q4, #13
eor t, e, f // y^z
vshr.u32 q4, q4, #10
and s, e, f // y&z
and t, t, d // x&(y^z)
veor q4, q6
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
veor q4, q7
eor t, g, g, ror #5 // S32(6, (x)) ^ S32(11, (x))
vshr.u32 q6, #2
add c, s
vshl.i32 q7, #2
eor t, t, g, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
veor q4, q6
and s, g, h // (x) & (y)
veor q4, q7
add b, t, ror #6 // use h to store h+Sigma1(e)
vadd.s32 $1, q4 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
bic t, a, g // (~(x)) & (z)
vext.64 q4, zero, $1, #1 // Q4 = (w17:w16 0 0)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(2+$0) //
add b, t // t = h+Sigma1(e)+Ch(e,f,g);
vshr.u32 q6, q4, #17
eor t, c, c, ror #11 // S32(2, (x)) ^ S32(13, (x))
vshl.i32 q7, q4, #13
add b, s // h = T1
vshr.u32 q4, q4, #10
eor t, t, c, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add f, b // d += T1;
veor q4, q6
add b, t, ror #2 // h = T1 + Sigma0(a);
vshr.u32 q6, #2
eor t, d, e // y^z
veor q4, q7
and s, d, e // y&z
vshl.i32 q7, #2
and t, t, c // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
veor q4, q6
eor t, f, f, ror #5 // S32(6, (x)) ^ S32(11, (x))
veor q4, q7
add b, s
eor t, t, f, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, f, g // (x) & (y)
add a, t, ror #6 // use h to store h+Sigma1(e)
bic t, h, f // (~(x)) & (z)
vadd.s32 $1, q4 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
ldr s, WK(3+$0) //
add a, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, b, b, ror #11 // S32(2, (x)) ^ S32(13, (x))
add a, s // h = T1
eor t, t, b, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
vadd.s32 q5, $1 // W+K
add e, a // d += T1;
add a, t, ror #2 // h = T1 + Sigma0(a);
eor t, c, d // y^z
and s, c, d // y&z
and t, t, b // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add t, sp, #(($0&15)*4)
add a, s
vst1.32 {q5},[t,:128]
.endm
.subsections_via_symbols
.text
.p2align 4
K256:
.long 0x428a2f98
.long 0x71374491
.long 0xb5c0fbcf
.long 0xe9b5dba5
.long 0x3956c25b
.long 0x59f111f1
.long 0x923f82a4
.long 0xab1c5ed5
.long 0xd807aa98
.long 0x12835b01
.long 0x243185be
.long 0x550c7dc3
.long 0x72be5d74
.long 0x80deb1fe
.long 0x9bdc06a7
.long 0xc19bf174
.long 0xe49b69c1
.long 0xefbe4786
.long 0x0fc19dc6
.long 0x240ca1cc
.long 0x2de92c6f
.long 0x4a7484aa
.long 0x5cb0a9dc
.long 0x76f988da
.long 0x983e5152
.long 0xa831c66d
.long 0xb00327c8
.long 0xbf597fc7
.long 0xc6e00bf3
.long 0xd5a79147
.long 0x06ca6351
.long 0x14292967
.long 0x27b70a85
.long 0x2e1b2138
.long 0x4d2c6dfc
.long 0x53380d13
.long 0x650a7354
.long 0x766a0abb
.long 0x81c2c92e
.long 0x92722c85
.long 0xa2bfe8a1
.long 0xa81a664b
.long 0xc24b8b70
.long 0xc76c51a3
.long 0xd192e819
.long 0xd6990624
.long 0xf40e3585
.long 0x106aa070
.long 0x19a4c116
.long 0x1e376c08
.long 0x2748774c
.long 0x34b0bcb5
.long 0x391c0cb3
.long 0x4ed8aa4a
.long 0x5b9cca4f
.long 0x682e6ff3
.long 0x748f82ee
.long 0x78a5636f
.long 0x84c87814
.long 0x8cc70208
.long 0x90befffa
.long 0xa4506ceb
.long 0xbef9a3f7
.long 0xc67178f2
.syntax unified
.p2align 2
.code 16
.thumb_func _AccelerateCrypto_SHA256_compress
.globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:
// due to the change of order in the 2nd and 3rd calling arguments,
// we need to switch r1/r2 to use the original code
mov r12, r1
mov r1, r2
mov r2, r12
// push callee-saved registers
push {r4-r7,lr}
add r7, sp, #12 // set up dtrace frame pointer
push {r8-r11}
// align sp to 16-byte boundary
mov r12, sp
ands r12, r12, #15 // bytes to align to 16-byte boundary
it eq
addeq r12, #16 // if nothing, enforce to insert 16 bytes
sub sp, r12
str r12, [sp]
#if BUILDKERNEL
vpush {q8}
#endif
vpush {q0-q7}
#define stack_size (16*5) // 64-byte circular buffer for WK(0:15), plus 16 extra bytes for num_blocks and the loop counter
sub sp, #stack_size
str r2, num_blocks
veor zero, zero
// set up pointer to table K256[]
ldr K, L_table1
L_table0:
mov r12, pc
ldr K, [r12, K]
bal 0f
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+4)
0:
// load W[0:15]
vld1.s32 {W0-W1},[data]!
vld1.s32 {W2-W3},[data]!
// load K[0:15] & per word byte swap
vrev32.8 W0, W0
vrev32.8 W1, W1
vld1.s32 {q4-q5}, [K,:128]!
vrev32.8 W2, W2
vrev32.8 W3, W3
vld1.s32 {q6-q7}, [K,:128]!
// compute WK[0:15] and save in stack
vadd.s32 q4, q0
vadd.s32 q5, q1
vadd.s32 q6, q2
vadd.s32 q7, q3
vstmia sp,{q4-q7}
// digests a-h = ctx->states;
ldmia ctx,{a-d,e-h}
L_loop:
// rounds 0:47 interleaved with W/WK update for rounds 16:63
mov t, #3
str t, _i_loop
L_i_loop:
rounds_a_schedule_update 0,W0,W1,W2,W3
rounds_e_schedule_update 4,W1,W2,W3,W0
rounds_a_schedule_update 8,W2,W3,W0,W1
rounds_e_schedule_update 12,W3,W0,W1,W2
ldr t, _i_loop
subs t, t, #1
str t, _i_loop
bgt L_i_loop
// revert K to the beginning of K256[]
ldr t, num_blocks
sub K, #256
subs t, #1 // num_blocks--
beq L_final_block // if final block, wrap up final rounds
str t, num_blocks
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
#if 0
rounds_a 48
update_W_WK 0, W0
rounds_e 52
update_W_WK 1, W1
rounds_a 56
update_W_WK 2, W2
rounds_e 60
update_W_WK 3, W3
#else
rounds_a_update_W_WK 48, 0, W0
rounds_e_update_W_WK 52, 1, W1
rounds_a_update_W_WK 56, 2, W2
rounds_e_update_W_WK 60, 3, W3
#endif
// ctx->states += digests a-h
Update_Digits
// digests a-h = ctx->states;
// ldmia ctx,{a-d,e-h}
bal L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds_a 48
rounds_e 52
rounds_a 56
rounds_e 60
// ctx->states += digests a-h
Update_Digits
// free allocated stack memory
add sp, #stack_size
// restore q0-q7, plus q8 if kernel
vpop {q0-q1}
vpop {q2-q3}
vpop {q4-q5}
vpop {q6-q7}
#if BUILDKERNEL
vpop {q8}
#endif
// dealign sp from the 16-byte boundary
ldr r12, [sp]
add sp, r12
// restore callee-save registers and return
pop {r8-r11}
pop {r4-r7,pc}
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 2
L_Tab$non_lazy_ptr:
.indirect_symbol K256
.long 0
#endif // (defined(__arm__) && defined(__ARM_NEON__))

View File

@ -0,0 +1,389 @@
# Copyright (c) (2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides armv7+neon hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or memory
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
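The entry point below drives the same transform with the ARMv8 Cryptographic Extension
SHA-256 instructions (SHA256H, SHA256H2, SHA256SU0, SHA256SU1, wrapped by the macros from
arm64_isa_compatibility.h). Roughly the same flow can be sketched with the arm_neon.h
intrinsics; the routine below is a hedged illustration under that assumption (it needs a
toolchain targeting the SHA-2 extension and reuses the sha256_K table defined elsewhere in
this package), not the function exported by this file.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

extern const uint32_t sha256_K[64];

static void sha256_blocks_sha2ext(uint32_t state[8], const uint8_t *data, size_t nblocks)
{
    uint32x4_t s0 = vld1q_u32(state);       // a,b,c,d
    uint32x4_t s1 = vld1q_u32(state + 4);   // e,f,g,h
    while (nblocks--) {
        uint32x4_t abcd = s0, efgh = s1;    // running hash, added back at the end of the block
        uint32x4_t m0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data +  0)));  // byte swap W0:W3
        uint32x4_t m1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
        uint32x4_t m2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
        uint32x4_t m3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));
        data += 64;
        for (int r = 0; r < 64; r += 4) {
            uint32x4_t wk = vaddq_u32(m0, vld1q_u32(sha256_K + r));   // W+K for rounds r:r+3
            if (r < 48)                                               // schedule 16 rounds ahead
                m0 = vsha256su1q_u32(vsha256su0q_u32(m0, m1), m2, m3);
            uint32x4_t tmp = s0;
            s0 = vsha256hq_u32(s0, s1, wk);                           // update a,b,c,d
            s1 = vsha256h2q_u32(s1, tmp, wk);                         // update e,f,g,h
            uint32x4_t rot = m0; m0 = m1; m1 = m2; m2 = m3; m3 = rot; // rotate the W window
        }
        s0 = vaddq_u32(s0, abcd);                                     // ctx->states += digests
        s1 = vaddq_u32(s1, efgh);
    }
    vst1q_u32(state, s0);
    vst1q_u32(state + 4, s1);
}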
#if defined(__arm64__)
#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"
.subsections_via_symbols
.text
.p2align 4
K256:
.long 0x428a2f98
.long 0x71374491
.long 0xb5c0fbcf
.long 0xe9b5dba5
.long 0x3956c25b
.long 0x59f111f1
.long 0x923f82a4
.long 0xab1c5ed5
.long 0xd807aa98
.long 0x12835b01
.long 0x243185be
.long 0x550c7dc3
.long 0x72be5d74
.long 0x80deb1fe
.long 0x9bdc06a7
.long 0xc19bf174
.long 0xe49b69c1
.long 0xefbe4786
.long 0x0fc19dc6
.long 0x240ca1cc
.long 0x2de92c6f
.long 0x4a7484aa
.long 0x5cb0a9dc
.long 0x76f988da
.long 0x983e5152
.long 0xa831c66d
.long 0xb00327c8
.long 0xbf597fc7
.long 0xc6e00bf3
.long 0xd5a79147
.long 0x06ca6351
.long 0x14292967
.long 0x27b70a85
.long 0x2e1b2138
.long 0x4d2c6dfc
.long 0x53380d13
.long 0x650a7354
.long 0x766a0abb
.long 0x81c2c92e
.long 0x92722c85
.long 0xa2bfe8a1
.long 0xa81a664b
.long 0xc24b8b70
.long 0xc76c51a3
.long 0xd192e819
.long 0xd6990624
.long 0xf40e3585
.long 0x106aa070
.long 0x19a4c116
.long 0x1e376c08
.long 0x2748774c
.long 0x34b0bcb5
.long 0x391c0cb3
.long 0x4ed8aa4a
.long 0x5b9cca4f
.long 0x682e6ff3
.long 0x748f82ee
.long 0x78a5636f
.long 0x84c87814
.long 0x8cc70208
.long 0x90befffa
.long 0xa4506ceb
.long 0xbef9a3f7
.long 0xc67178f2
.p2align 4
.globl _AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:
#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3
BRANCH_TARGET_CALL
#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
adrp ktable, K256@page
cbnz numblocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
ret lr // otherwise, return
1:
add ktable, ktable, K256@pageoff
#if BUILDKERNEL
// save q0-q7, q16-q24: 8+8+1 = 17 vector registers (17*16 bytes)
sub x4, sp, #17*16
sub sp, sp, #17*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20, v21, v22, v23}, [x4], #64
st1.4s {v24}, [x4], #16
#endif
ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
rev32.16b v0, v0 // byte swap of 1st 4 ints
ldr q21, [ktable, #16*0]
rev32.16b v1, v1 // byte swap of 2nd 4 ints
ldr q16, [hashes, #0]
rev32.16b v2, v2 // byte swap of 3rd 4 ints
ldr q17, [hashes, #16]
rev32.16b v3, v3 // byte swap of 4th 4 ints
ldr q22, [ktable, #16*1]
mov.16b v18, v16
ldr q23, [ktable, #16*2]
add.4s v4, v0, v21 // 1st 4 input + K256
ldr q24, [ktable, #16*3]
add.4s v5, v1, v22 // 2nd 4 input + K256
mov.16b v19, v17
add.4s v6, v2, v23 // 3rd 4 input + K256
add.4s v7, v3, v24 // 4th 4 input + K256
add ktable, ktable, #16*4
.macro sha256_round
mov.16b v20, v18
SHA256SU0 $0, $1
SHA256H 18, 19, $4
SHA256SU1 $0, $2, $3
SHA256H2 19, 20, $4
add.4s $6, $5, $7
.endm
// 4 vector hashes update and load next vector rounds
.macro sha256_hash_load_round
mov.16b v20, v18
SHA256H 18, 19, $0
rev32.16b $1, $1
SHA256H2 19, 20, $0
add.4s $2, $1, $3
.endm
.macro sha256_hash_round
mov.16b v20, v18
SHA256H 18, 19, $0
SHA256H2 19, 20, $0
.endm
// 12 vector hash and sequence update rounds
mov w4, #3
L_i_loop:
mov.16b v20, v18
ldr q21, [ktable, #0] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #32] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #48] // k3
SHA256H2 19, 20, 4
add ktable, ktable, #64
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
subs w4, w4, #1
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
b.gt L_i_loop
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.le L_wrapup
sub ktable, ktable, #256
L_loop:
ldr q0, [data, #0]
mov.16b v20, v18
ldr q21, [ktable,#0]
SHA256H 18, 19, 4
ldr q1, [data, #16]
rev32.16b v0, v0
ldr q2, [data, #32]
SHA256H2 19, 20, 4
ldr q3, [data, #48]
add.4s v4, v0, v21
ldr q22, [ktable,#16]
mov.16b v20, v18
add data, data, #64
SHA256H 18, 19, 5
ldr q23, [ktable,#32]
rev32.16b v1, v1
ldr q24, [ktable,#48]
SHA256H2 19, 20, 5
add.4s v5, v1, v22
sha256_hash_load_round 6, v2, v6, v23
sha256_hash_load_round 7, v3, v7, v24
add.4s v18, v16, v18
add.4s v19, v17, v19
mov.16b v16, v18
mov.16b v17, v19
// 12 vector hash and sequence update rounds
mov.16b v20, v18
ldr q21, [ktable, #16*4] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16*5] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #16*6] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #16*7] // k3
SHA256H2 19, 20, 4
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
mov.16b v20, v18
ldr q21, [ktable, #16*8] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16*9] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #16*10] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #16*11] // k3
SHA256H2 19, 20, 4
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
mov.16b v20, v18
ldr q21, [ktable, #16*12] // k0
SHA256SU0 0, 1
ldr q22, [ktable, #16*13] // k1
SHA256H 18, 19, 4
ldr q23, [ktable, #16*14] // k2
SHA256SU1 0, 2, 3
ldr q24, [ktable, #16*15] // k3
SHA256H2 19, 20, 4
add.4s v4, v0, v21
sha256_round 1, 2, 3, 0, 5, v1, v5, v22
sha256_round 2, 3, 0, 1, 6, v2, v6, v23
sha256_round 3, 0, 1, 2, 7, v3, v7, v24
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
b.gt L_loop
L_wrapup:
sha256_hash_round 4
sha256_hash_round 5
sha256_hash_round 6
sha256_hash_round 7
add.4s v16, v16, v18
add.4s v17, v17, v19
st1.4s {v16,v17}, [hashes] // hashes q16 : d,c,b,a q17 : h,g,f,e
#if BUILDKERNEL
// restore q0-q7, q16-q24
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20, v21, v22, v23}, [sp], #64
ld1.4s {v24}, [sp], #16
#endif
ret lr
#endif // arm64

View File

@ -0,0 +1,796 @@
# Copyright (c) (2011-2013,2015,2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This is for the Chinook AOP (arm64), which does not support crypto instructions.
This file provides arm64 neon hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or memory
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
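Because this target lacks the SHA-256 instructions, the macros below compute sigma0/sigma1
and the schedule extension with plain NEON shifts, eors and ext operations. The same step
can be sketched with generic arm_neon.h intrinsics; the following is a hedged illustration
(names are local to the sketch; sha256_K is the table defined elsewhere in this package):
given W[r..r+15] in four vectors it produces W[r+16..r+19] and the corresponding W+K.

#include <arm_neon.h>
#include <stdint.h>

extern const uint32_t sha256_K[64];

#define ROTR32X4(x, n)  vorrq_u32(vshrq_n_u32((x), (n)), vshlq_n_u32((x), 32 - (n)))

static inline uint32x4_t sigma0x4(uint32x4_t x)   // ROTR7 ^ ROTR18 ^ SHR3, per 32-bit lane
{
    return veorq_u32(veorq_u32(ROTR32X4(x, 7), ROTR32X4(x, 18)), vshrq_n_u32(x, 3));
}

static inline uint32x4_t sigma1x4(uint32x4_t x)   // ROTR17 ^ ROTR19 ^ SHR10, per 32-bit lane
{
    return veorq_u32(veorq_u32(ROTR32X4(x, 17), ROTR32X4(x, 19)), vshrq_n_u32(x, 10));
}

// m0 = W[r:r+3], m1 = W[r+4:r+7], m2 = W[r+8:r+11], m3 = W[r+12:r+15]
// returns W[r+16:r+19]; *wk receives W+K for those four future rounds
static inline uint32x4_t schedule4(uint32x4_t m0, uint32x4_t m1,
                                   uint32x4_t m2, uint32x4_t m3,
                                   int r, uint32x4_t *wk)
{
    const uint32x4_t zero = vdupq_n_u32(0);
    uint32x4_t w = vaddq_u32(m0, sigma0x4(vextq_u32(m0, m1, 1)));  // W[r-16] + sigma0(W[r-15])
    w = vaddq_u32(w, vextq_u32(m2, m3, 1));                        // + W[r-7]
    w = vaddq_u32(w, sigma1x4(vextq_u32(m3, zero, 2)));            // + sigma1(W[r-2]) for the low 2 lanes
    w = vaddq_u32(w, sigma1x4(vextq_u32(zero, w, 2)));             // + sigma1(W[r-2]) for the high 2 lanes
    *wk = vaddq_u32(w, vld1q_u32(sha256_K + r + 16));
    return w;
}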
// associate variables with registers or memory
#define ctx x0
#define num_blocks x1
#define data x2
#define ktable x3
#define _i_loop x4
#define a w5
#define bb w6
#define c w7
#define d w8
#define e w9
#define f w10
#define g w11
#define h w12
// 2 local variables
#define t w13
#define s w14
// a window (16 words) of message schedule
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define qW0 q0
#define qW1 q1
#define qW2 q2
#define qW3 q3
#define zero v16
#define WK0 v4
#define WK1 v5
#define WK2 v6
#define WK3 v7
#define qWK0 q4
#define qWK1 q5
#define qWK2 q6
#define qWK3 q7
// circular buffer for WK[(r:r+15)%16]
#define WK(r) [sp,#((r)&15)*4]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mvn t, $0 // ~x
and s, $0, $1 // (x) & (y)
and t, t, $2 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor t, t, s // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma0
vshr.u32 q6, $0, #7
vshl.i32 q7, $0, #14
vshr.u32 $0, $0, #3
veor $0, q6
veor $0, q7
vshr.u32 q6, #11
vshl.i32 q7, #11
veor $0, q6
veor $0, q7
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on a Q register
// use q6/q7 as intermediate registers
.macro sigma1
vshr.u32 q6, $0, #17
vshl.i32 q7, $0, #13
vshr.u32 $0, $0, #10
veor $0, q6
veor $0, q7
vshr.u32 q6, #2
vshl.i32 q7, #2
veor $0, q6
veor $0, q7
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
ror t, $0, #2 // S32(2, (x))
ror s, $0, #13 // S32(13, (x))
eor t, t, s // S32(2, (x)) ^ S32(13, (x))
ror s, s, #9 // S32(22, (x))
eor t, t, s // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
ror t, $0, #6 // S32(6, (x))
ror s, $0, #11 // S32(11, (x))
eor t, t, s // S32(6, (x)) ^ S32(11, (x))
ror s, s, #14 // S32(25, (x))
eor t, t, s // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
// ror t, $4, #6 // S32(6, (x))
eor t, t, $4, ror #11 // S32(6, (x)) ^ S32(11, (x))
eor t, t, $4, ror #25 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
and s, $4, $5 // (x) & (y)
add $7, $7, t // use h to store h+Sigma1(e)
bic t, $6, $4 // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $8 //
add $7, $7, t // t = h+Sigma1(e)+Ch(e,f,g);
ror t, $0, #2 // S32(2, (x))
add $7, $7, s // h = T1
eor t, t, $0, ror #13 // S32(2, (x)) ^ S32(13, (x))
add $3, $3, $7 // d += T1;
eor t, t, $0, ror #22 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add $7, $7, t // h = T1 + Sigma0(a);
eor t, $1, $2 // y^z
and s, $1, $2 // y&z
and t, t, $0 // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
// add $7, s // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
ror t, $4, #6
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
ror t, $3, #6
add $7, s
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
ror t, $2, #6
add $6, s
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
ror t, $1, #6
add $5, s
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
add $4, s
.endm
.macro rounds_a
ror t, e, #6
round a, bb, c, d, e, f, g, h, $0.s[0]
ror t, d, #6
add h, h, s
round h, a, bb, c, d, e, f, g, $0.s[1]
ror t, c, #6
add g, g, s
round g, h, a, bb, c, d, e, f, $0.s[2]
ror t, bb, #6
add f, f, s
round f, g, h, a, bb, c, d, e, $0.s[3]
add e, e, s
.endm
.macro rounds_e
ror t, a, #6
round e, f, g, h, a, bb, c, d, $0.s[0]
ror t, h, #6
add d, d, s
round d, e, f, g, h, a, bb, c, $0.s[1]
ror t, g, #6
add c, c, s
round c, d, e, f, g, h, a, bb, $0.s[2]
ror t, f, #6
add bb, bb, s
round bb, c, d, e, f, g, h, a, $0.s[3]
add a, a, s
.endm
.macro rounds_a_update_W_WK
ror t, e, #6
ldr $3, [data], #16
round a, bb, c, d, e, f, g, h, $0.s[0]
ror t, d, #6
rev32.16b $1, $1
add h, h, s
round h, a, bb, c, d, e, f, g, $0.s[1]
ror t, c, #6
add g, g, s
ldr q17, [ktable], #16
round g, h, a, bb, c, d, e, f, $0.s[2]
ror t, bb, #6
add f, f, s
round f, g, h, a, bb, c, d, e, $0.s[3]
add e, e, s
add.4s $0, v17, $1
.endm
.macro rounds_e_update_W_WK
ror t, a, #6
ldr $3, [data], #16
round e, f, g, h, a, bb, c, d, $0.s[0]
ror t, h, #6
rev32.16b $1, $1
add d, d, s
round d, e, f, g, h, a, bb, c, $0.s[1]
ror t, g, #6
add c, c, s
ldr q17, [ktable], #16
round c, d, e, f, g, h, a, bb, $0.s[2]
ror t, f, #6
add bb, bb, s
round bb, c, d, e, f, g, h, a, $0.s[3]
add a, a, s
add.4s $0, v17, $1
.endm
// this macro is used in the last 16 rounds of a current block
// it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3]
// and save into stack to prepare for next block
.macro update_W_WK
ldr $3, [data]
ldr $2, [ktable]
add data, data, #16
rev32.16b $1, $1
add ktable, ktable, #16
add.4s $0, $0, $1
.endm
.macro Update_Digits
ldp t, s, [ctx]
add a, a, t
add bb, bb, s
stp a, bb, [ctx]
ldp t, s, [ctx,#8]
add c, c, t
add d, d, s
stp c, d, [ctx, #8]
ldp t, s, [ctx,#16]
add e, e, t
add f, f, s
stp e, f, [ctx, #16]
ldp t, s, [ctx,#24]
add g, g, t
add h, h, s
stp g, h, [ctx, #24]
.endm
.macro rounds_a_schedule_update
eor t, e, e, ror #5 // S32(6, (x)) ^ S32(11, (x))
ldr q17, [ktable], #16
eor t, t, e, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ext.16b v18, $1, $2, #4 // w4:w1
ror t, t, #6
and s, e, f // (x) & (y)
add h, h, t // use h to store h+Sigma1(e)
bic t, g, e // (~(x)) & (z)
ushr.4s v19, v18, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[0] //
add h, h, t // t = h+Sigma1(e)+Ch(e,f,g);
shl.4s v20, v18, #14
eor t, a, a, ror #11 // S32(2, (x)) ^ S32(13, (x))
ushr.4s v18, v18, #3
add h, h, s // h = T1
eor t, t, a, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add d, d, h // d += T1;
ror t, t, #2
eor.16b v18, v18, v19
add h, h, t // h = T1 + Sigma0(a);
ushr.4s v19, v19, #11
eor t, bb, c // y^z
and s, bb, c // y&z
and t, t, a // x&(y^z)
eor.16b v18, v18, v20
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
shl.4s v20, v20, #11
eor t, d, d, ror #5 // S32(6, (x)) ^ S32(11, (x))
add h, h, s
eor t, t, d, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
eor.16b v18, v18, v19
and s, d, e // (x) & (y)
ext.16b v19, $3, $4, #4 // q19 = w12:w9
ror t, t, #6
add g, g, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, f, d // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[1] //
add g, g, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, h, h, ror #11 // S32(2, (x)) ^ S32(13, (x))
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1)
add g, g, s // h = T1
ext.16b v18, $4, zero, #8 // 0 0 w15:w14
eor t, t, h, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add.4s $1, $1, v19 // w3:w0 + sigma0(w4:w1) + w12:w9
ror t, t, #2
add c, c, g // d += T1;
ushr.4s v19, v18, #17
add g, g, t // h = T1 + Sigma0(a);
shl.4s v20, v18, #13
eor t, a, bb // y^z
ushr.4s v18, v18, #10
and s, a, bb // y&z
and t, t, h // x&(y^z)
eor.16b v18, v18, v19
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
ushr.4s v19, v19, #2
eor t, c, c, ror #5 // S32(6, (x)) ^ S32(11, (x))
add g, g, s
eor.16b v18, v18, v20
eor t, t, c, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
shl.4s v20, v20, #2
ror t, t, #6
and s, c, d // (x) & (y)
eor.16b v18, v18, v19
add f, f, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, e, c // (~(x)) & (z)
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[2] //
add f, f, t // t = h+Sigma1(e)+Ch(e,f,g);
ext.16b v18, zero, $1, #8 // Q4 = (w17:w16 0 0)
eor t, g, g, ror #11 // S32(2, (x)) ^ S32(13, (x))
add f, f, s // h = T1
eor t, t, g, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
ushr.4s v19, v18, #17
add bb, bb, f // d += T1;
shl.4s v20, v18, #13
ror t, t, #2
ushr.4s v18, v18, #10
add f, f, t // h = T1 + Sigma0(a);
eor t, h, a // y^z
and s, h, a // y&z
eor.16b v18, v18, v19
and t, t, g // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, bb, bb, ror #5 // S32(6, (x)) ^ S32(11, (x))
add f, f, s
eor.16b v18, v18, v20
eor t, t, bb, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ushr.4s v19, v19, #2
ror t, t, #6
shl.4s v20, v20, #2
and s, bb, c // (x) & (y)
eor.16b v18, v18, v19
add e, e, t // use h to store h+Sigma1(e)
bic t, d, bb // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[3] //
add e, e, t // t = h+Sigma1(e)+Ch(e,f,g);
eor.16b v18, v18, v20
eor t, f, f, ror #11 // S32(2, (x)) ^ S32(13, (x))
add e, e, s // h = T1
eor t, t, f, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add a, a, e // d += T1;
ror t, t, #2
add.4s $1, $1, v18 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
add e, e, t // h = T1 + Sigma0(a);
eor t, g, h // y^z
and s, g, h // y&z
add.4s $5, v17, $1 // W+K
and t, t, f // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add e, e, s
.endm
.macro rounds_e_schedule_update
eor t, a, a, ror #5 // S32(6, (x)) ^ S32(11, (x))
ldr q17, [ktable], #16 // K
eor t, t, a, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ext.16b v18, $1, $2, #4 // Q18 = w4:w1
ror t, t, #6
and s, a, bb // (x) & (y)
add d, d, t // use h to store h+Sigma1(e)
bic t, c, a // (~(x)) & (z)
ushr.4s v19, v18, #7
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[0]
add d, d, t // t = h+Sigma1(e)+Ch(e,f,g);
shl.4s v20, v18, #14
eor t, e, e, ror #11 // S32(2, (x)) ^ S32(13, (x))
ushr.4s v18, v18, #3
add d, d, s // h = T1
eor t, t, e, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add h, h, d // d += T1;
ror t, t, #2
eor.16b v18, v18, v19
add d, d, t // h = T1 + Sigma0(a);
ushr.4s v19, v19, #11
eor t, f, g // y^z
and s, f, g // y&z
and t, t, e // x&(y^z)
eor.16b v18, v18, v20
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
shl.4s v20, v20, #11
eor t, h, h, ror #5 // S32(6, (x)) ^ S32(11, (x))
add d, d, s
eor t, t, h, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
eor.16b v18, v18, v19
and s, h, a // (x) & (y)
ext.16b v19, $3, $4, #4 // q19 = w12:w9
ror t, t, #6
add c, c, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, bb, h // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[1]
add c, c, t // t = h+Sigma1(e)+Ch(e,f,g);
eor t, d, d, ror #11 // S32(2, (x)) ^ S32(13, (x))
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1)
add c, c, s // h = T1
ext.16b v18, $4, zero, #8 // 0 0 w15:w14
eor t, t, d, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
add.4s $1, $1, v19 // w3:w0 + sigma0(w4:w1) + w12:w9
ror t, t, #2
add g, g, c // d += T1;
ushr.4s v19, v18, #17
add c, c, t // h = T1 + Sigma0(a);
shl.4s v20, v18, #13
eor t, e, f // y^z
ushr.4s v18, v18, #10
and s, e, f // y&z
and t, t, d // x&(y^z)
eor.16b v18, v18, v19
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
ushr.4s v19, v19, #2
eor t, g, g, ror #5 // S32(6, (x)) ^ S32(11, (x))
add c, c, s
eor.16b v18, v18, v20
eor t, t, g, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
shl.4s v20, v20, #2
ror t, t, #6
and s, g, h // (x) & (y)
eor.16b v18, v18, v19
add bb, bb, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v20
bic t, a, g // (~(x)) & (z)
add.4s $1, $1, v18 // w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(0 0 w15:w14)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[2]
add bb, bb, t // t = h+Sigma1(e)+Ch(e,f,g);
ext.16b v18, zero, $1, #8 // Q18 = (w17:w16 0 0)
eor t, c, c, ror #11 // S32(2, (x)) ^ S32(13, (x))
add bb, bb, s // h = T1
eor t, t, c, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
ushr.4s v19, v18, #17
add f, f, bb // d += T1;
shl.4s v20, v18, #13
ror t, t, #2
ushr.4s v18, v18, #10
add bb, bb, t // h = T1 + Sigma0(a);
eor t, d, e // y^z
and s, d, e // y&z
eor.16b v18, v18, v19
and t, t, c // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
eor t, f, f, ror #5 // S32(6, (x)) ^ S32(11, (x))
add bb, bb, s
eor.16b v18, v18, v20
eor t, t, f, ror #19 // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
ushr.4s v19, v19, #2
ror t, t, #6
shl.4s v20, v20, #2
and s, f, g // (x) & (y)
add a, a, t // use h to store h+Sigma1(e)
eor.16b v18, v18, v19
bic t, h, f // (~(x)) & (z)
eor t, t, s // t = Ch(x,y,z) = (((x) & (y)) ^ ((~(x)) & (z)))
mov s, $5.s[3]
add a, a, t // t = h+Sigma1(e)+Ch(e,f,g);
eor.16b v18, v18, v20
eor t, bb, bb, ror #11 // S32(2, (x)) ^ S32(13, (x))
add a, a, s // h = T1
eor t, t, bb, ror #20 // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) // t = Sigma0(a);
ror t, t, #2
add.4s $1, $1, v18 // w19:w16 = w3:w0 + sigma0(w4:w1) + w12:w9 + sigma1(w17:w14)
add e, e, a // d += T1;
add a, a, t // h = T1 + Sigma0(a);
eor t, c, d // y^z
and s, c, d // y&z
add.4s $5, v17, $1 // W+K
and t, t, bb // x&(y^z)
eor s, s, t // t = Maj(x,y,z) = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
add a, a, s
.endm
#if defined(__arm64__)
#include "ccarm_pac_bti_macros.h"
.subsections_via_symbols
.text
.p2align 4
K256:
.long 0x428a2f98
.long 0x71374491
.long 0xb5c0fbcf
.long 0xe9b5dba5
.long 0x3956c25b
.long 0x59f111f1
.long 0x923f82a4
.long 0xab1c5ed5
.long 0xd807aa98
.long 0x12835b01
.long 0x243185be
.long 0x550c7dc3
.long 0x72be5d74
.long 0x80deb1fe
.long 0x9bdc06a7
.long 0xc19bf174
.long 0xe49b69c1
.long 0xefbe4786
.long 0x0fc19dc6
.long 0x240ca1cc
.long 0x2de92c6f
.long 0x4a7484aa
.long 0x5cb0a9dc
.long 0x76f988da
.long 0x983e5152
.long 0xa831c66d
.long 0xb00327c8
.long 0xbf597fc7
.long 0xc6e00bf3
.long 0xd5a79147
.long 0x06ca6351
.long 0x14292967
.long 0x27b70a85
.long 0x2e1b2138
.long 0x4d2c6dfc
.long 0x53380d13
.long 0x650a7354
.long 0x766a0abb
.long 0x81c2c92e
.long 0x92722c85
.long 0xa2bfe8a1
.long 0xa81a664b
.long 0xc24b8b70
.long 0xc76c51a3
.long 0xd192e819
.long 0xd6990624
.long 0xf40e3585
.long 0x106aa070
.long 0x19a4c116
.long 0x1e376c08
.long 0x2748774c
.long 0x34b0bcb5
.long 0x391c0cb3
.long 0x4ed8aa4a
.long 0x5b9cca4f
.long 0x682e6ff3
.long 0x748f82ee
.long 0x78a5636f
.long 0x84c87814
.long 0x8cc70208
.long 0x90befffa
.long 0xa4506ceb
.long 0xbef9a3f7
.long 0xc67178f2
.p2align 4
.globl _AccelerateCrypto_SHA256_compress_arm64neon
_AccelerateCrypto_SHA256_compress_arm64neon:
BRANCH_TARGET_CALL
adrp ktable, K256@page
cbnz num_blocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
ret lr // otherwise, return
1:
add ktable, ktable, K256@pageoff
#if BUILDKERNEL
// save q0-q7, q16-q20 8+4+1=13
sub x4, sp, #13*16
sub sp, sp, #13*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20}, [x4]
#endif
// load W[0:15]
ldr qW0, [data, #0*16]
movi.16b zero, #0
ldr qW1, [data, #1*16]
ldr qW2, [data, #2*16]
ldr qW3, [data, #3*16]
add data, data, #4*16
// load K[0:15] & per word byte swap
rev32.16b W0, W0
ldr qWK0, [ktable, #0*16]
rev32.16b W1, W1
ldr qWK1, [ktable, #1*16]
rev32.16b W2, W2
ldr qWK2, [ktable, #2*16]
rev32.16b W3, W3
ldr qWK3, [ktable, #3*16]
// compute WK[0:15]
add ktable, ktable, #4*16
add.4s WK0, WK0, W0
ldp a, bb, [ctx, #0*4]
add.4s WK1, WK1, W1
ldp c, d, [ctx, #2*4]
add.4s WK2, WK2, W2
ldp e, f, [ctx, #4*4]
add.4s WK3, WK3, W3
ldp g, h, [ctx, #6*4]
L_loop:
// rounds 0:47 interleaved with W/WK update for rounds 16:63
mov _i_loop, #3
L_i_loop:
rounds_a_schedule_update 0,W0,W1,W2,W3, WK0
rounds_e_schedule_update 4,W1,W2,W3,W0, WK1
rounds_a_schedule_update 8,W2,W3,W0,W1, WK2
rounds_e_schedule_update 12,W3,W0,W1,W2, WK3
subs _i_loop, _i_loop, #1
b.gt L_i_loop
// revert K to the beginning of K256[]
subs num_blocks, num_blocks, #1 // num_blocks--
sub ktable, ktable, #256
b.eq L_final_block // if final block, wrap up final rounds
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
rounds_a_update_W_WK WK0, W0, qWK0, qW0
rounds_e_update_W_WK WK1, W1, qWK1, qW1
rounds_a_update_W_WK WK2, W2, qWK2, qW2
rounds_e_update_W_WK WK3, W3, qWK3, qW3
// ctx->states += digests a-h, also update digest variables a-h
Update_Digits
b.al L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds_a WK0
rounds_e WK1
rounds_a WK2
rounds_e WK3
// ctx->states += digests a-h
Update_Digits
#if BUILDKERNEL
// restore q0-q7, q16-q20
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20}, [sp], #16
#endif
ret lr
#endif /* arm64 */

View File

@ -0,0 +1,30 @@
/* Copyright (c) (2010,2014-2016,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stdint.h>
#include <corecrypto/cc_config.h>
/* the K array */
const uint32_t sha256_K[64] CC_ALIGNED(16) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b,
0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01,
0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7,
0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152,
0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc,
0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819,
0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08,
0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f,
0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

View File

@ -0,0 +1,30 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
#if (defined(__x86_64__) || defined(__i386__))
extern void AccelerateCrypto_SHA256_compress_ssse3(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_ssse3");
extern void AccelerateCrypto_SHA256_compress_AVX1(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_AVX1");
extern void AccelerateCrypto_SHA256_compress_AVX2(uint32_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA256_compress_AVX2");
void AccelerateCrypto_SHA256_compress(uint32_t *state, size_t num, const void *buf)
{
#if defined(__x86_64__)
if (HAS_AVX2()) AccelerateCrypto_SHA256_compress_AVX2(state, num, buf);
else if (HAS_AVX1()) AccelerateCrypto_SHA256_compress_AVX1(state, num, buf);
else
#endif
AccelerateCrypto_SHA256_compress_ssse3(state, num, buf);
}
#endif // (defined(__x86_64__) || defined(__i386__))
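The dispatcher above picks the widest implementation the CPU supports at run time: AVX2,
then AVX1, falling back to the SSSE3 baseline (on i386 only the SSSE3 path is compiled in).
A hedged usage sketch follows; it assumes AccelerateCrypto.h declares the function with the
signature defined above, seeds the state with the standard SHA-256 IV (FIPS 180-4), and
leaves message padding and finalization to the caller.

#include <stdint.h>
#include <stddef.h>
#include "AccelerateCrypto.h"

static void sha256_state_init(uint32_t state[8])
{
    static const uint32_t iv[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    };
    for (int i = 0; i < 8; i++) state[i] = iv[i];
}

// feed whole 64-byte blocks; nbytes must be a multiple of 64
static void sha256_compress_blocks(uint32_t state[8], const void *buf, size_t nbytes)
{
    AccelerateCrypto_SHA256_compress(state, nbytes / 64, buf);
}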

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,504 @@
# Copyright (c) (2010,2011,2012,2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides i386 hand implementation of the following function
sha2_void sha256_compile(sha256_ctx ctx[1]);
which is a C function in CommonCrypto Source/Digest/sha2.c
The implementation here is modified from another i386 sha256 implementation in xnu.
To fit the new API,
the old ctx (which pointed to ctx->hashes) should be changed to ctx->hashes, 8(ctx), and
the old data (which pointed to ctx->wbuf) should be changed to ctx->wbuf, 40(ctx).
sha256_compile handles 1 input block (64 bytes) per call.
The following is comments for the initial xnu-sha256.s.
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
which is a C function in sha2.c (from xnu).
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<48;r+=4) {
digests a-h update and permute round r:r+3
update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=48;r<64;r+=4) {
digests a-h update and permute round r:r+3
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
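The body below loads each 16-byte group of the message with movdqu, byte-swaps it per
32-bit word through the pshufb mask kept in L_aligned_bswap, and adds K256 with paddd to
precompute W+K. A hedged intrinsics sketch of that setup step (names are local to the
sketch; the shuffle constant is the usual per-word byte-reversal mask and sha256_K is the
table defined elsewhere in this package):

#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
#include <stdint.h>

extern const uint32_t sha256_K[64];

static void sha256_load_W_WK(const uint8_t block[64], __m128i W[4], __m128i WK[4])
{
    const __m128i bswap = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
                                       4, 5, 6, 7, 0, 1, 2, 3);
    for (int i = 0; i < 4; i++) {
        __m128i w = _mm_loadu_si128((const __m128i *)(block + 16 * i));
        w = _mm_shuffle_epi8(w, bswap);                 // big-endian load of each 4-byte word
        W[i]  = w;
        WK[i] = _mm_add_epi32(w, _mm_loadu_si128((const __m128i *)(sha256_K + 4 * i)));
    }
}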
#if defined __i386__
// associate variables with registers or memory
#define sp %esp
#define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
#define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
#define num_blocks 24+stack_size(sp) // 2nd caller argument
#define data_addr 28+stack_size(sp) // 3rd caller argument
#define a %ebx
#define b %edx
#define c 64(sp)
#define d %ebp
#define e %esi
#define f 68(sp)
#define g %edi
#define h 72(sp)
#define K 76(sp) // pointer to K256[] table
#define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
#define xmm_save 96(sp) // starting address for xmm save/restore
// 2 local variables
#define t %eax
#define s %ecx
// a window (16 words) of message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*4(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mov $0, t // x
mov $0, s // x
not t // ~x
and $1, s // x & y
and $2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
mov $1, t // y
mov $2, s // z
xor $2, t // y^z
and $1, s // y&z
and $0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on an xmm registers
// use xmm6/xmm7 as intermediate registers
.macro sigma0
movdqa $0, %xmm6
movdqa $0, %xmm7
psrld $$3, $0 // SHR3(x)
psrld $$7, %xmm6 // part of ROTR7
pslld $$14, %xmm7 // part of ROTR18
pxor %xmm6, $0
pxor %xmm7, $0
psrld $$11, %xmm6 // part of ROTR18
pslld $$11, %xmm7 // part of ROTR7
pxor %xmm6, $0
pxor %xmm7, $0
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on an xmm registers
// use xmm6/xmm7 as intermediate registers
.macro sigma1
movdqa $0, %xmm6
movdqa $0, %xmm7
psrld $$10, $0 // SHR10(x)
psrld $$17, %xmm6 // part of ROTR17
pxor %xmm6, $0
pslld $$13, %xmm7 // part of ROTR19
pxor %xmm7, $0
psrld $$2, %xmm6 // part of ROTR19
pxor %xmm6, $0
pslld $$2, %xmm7 // part of ROTR17
pxor %xmm7, $0
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
mov $0, t // x
mov $0, s // x
ror $$2, t // S32(2, (x))
ror $$13, s // S32(13, (x))
xor s, t // S32(2, (x)) ^ S32(13, (x))
ror $$9, s // S32(22, (x))
xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
mov $0, s // x
ror $$6, s // S32(6, (x))
mov s, t // S32(6, (x))
ror $$5, s // S32(11, (x))
xor s, t // S32(6, (x)) ^ S32(11, (x))
ror $$14, s // S32(25, (x))
xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
Sigma1 $4 // t = T1
add t, $7 // use h to store h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g);
add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
add WK($8), t // h = T1
add t, $3 // d += T1;
mov t, $7 // h = T1
Sigma0 $0 // t = Sigma0(a);
add t, $7 // h = T1 + Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
.endm
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
.macro message_schedule
// 4 32-bit K256 words in xmm5
mov K, t
movdqu (t), %xmm5
addl $$16, K // K points to next K256 word for next iteration
movdqa $1, %xmm4 // W7:W4
palignr $$4, $0, %xmm4 // W4:W1
sigma0 %xmm4 // sigma0(W4:W1)
movdqa $3, %xmm6 // W15:W12
paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
palignr $$4, $2, %xmm6 // W12:W9
paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
movdqa $3, %xmm4 // W15:W12
psrldq $$8, %xmm4 // 0,0,W15,W14
sigma1 %xmm4 // sigma1(0,0,W15,W14)
paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
pslldq $$8, %xmm4 // W17, W16, 0, 0
sigma1 %xmm4 // sigma1(W17,W16,0,0)
paddd %xmm4, $0 // W19:W16
paddd $0, %xmm5 // WK
movdqa %xmm5, WK($4)
.endm
// this macro is used in the last 16 rounds of the current block
// it reads 4 4-byte words of the next message block, loads them (big-endian) into W[r:r+3], computes WK[r:r+3],
// and saves them into the stack circular buffer to prepare for the next block
.macro update_W_WK
mov data_addr, t
movdqu $0*16(t), $1 // read 4 4-byte words
pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
mov K, t
movdqu $0*16(t), %xmm4 // K[r:r+3]
paddd $1, %xmm4 // WK[r:r+3]
movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
.endm
.section __IMPORT,__pointers,non_lazy_symbol_pointers
L_sha256_K$non_lazy_ptr:
.indirect_symbol CC_C_LABEL(sha256_K)
.long 0
.text
.globl _AccelerateCrypto_SHA256_compress_ssse3
_AccelerateCrypto_SHA256_compress_ssse3:
// push callee-saved registers
push %ebp
push %ebx
push %esi
push %edi
// allocate stack space
sub $stack_size, sp
// if kernel code, save used xmm registers
#if BUILDKERNEL
movdqa %xmm0, 0*16+xmm_save
movdqa %xmm1, 1*16+xmm_save
movdqa %xmm2, 2*16+xmm_save
movdqa %xmm3, 3*16+xmm_save
movdqa %xmm4, 4*16+xmm_save
movdqa %xmm5, 5*16+xmm_save
movdqa %xmm6, 6*16+xmm_save
movdqa %xmm7, 7*16+xmm_save
#endif
// set up bswap parameters in the aligned stack space and pointer to table K256[]
call 0f // Push program counter onto stack.
0: pop t // Get program counter.
mov L_sha256_K$non_lazy_ptr-0b(t), t
mov t, K
call 0f // Push program counter onto stack.
0: pop %eax // Get program counter.
lea L_bswap-0b(%eax), %eax
movdqa (%eax), %xmm0
movdqa %xmm0, L_aligned_bswap
// load W[0:15] into xmm0-xmm3
mov data_addr, t
movdqu 0*16(t), W0
movdqu 1*16(t), W1
movdqu 2*16(t), W2
movdqu 3*16(t), W3
addl $64, data_addr
pshufb L_aligned_bswap, W0
pshufb L_aligned_bswap, W1
pshufb L_aligned_bswap, W2
pshufb L_aligned_bswap, W3
// compute WK[0:15] and save in stack
mov K, t
movdqu 0*16(t), %xmm4
movdqu 1*16(t), %xmm5
movdqu 2*16(t), %xmm6
movdqu 3*16(t), %xmm7
addl $64, K
paddd %xmm0, %xmm4
paddd %xmm1, %xmm5
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm4, WK(0)
movdqa %xmm5, WK(4)
movdqa %xmm6, WK(8)
movdqa %xmm7, WK(12)
L_loop:
// digests a-h = ctx->states;
mov ctx_addr, t
mov 0*4(t), a
mov 1*4(t), b
mov 2*4(t), s
mov s, c
mov 3*4(t), d
mov 4*4(t), e
mov 5*4(t), s
mov s, f
mov 6*4(t), g
mov 7*4(t), s
mov s, h
// rounds 0:47 interleaved with W/WK update for rounds 16:63
rounds a, b, c, d, e, f, g, h, 0
message_schedule W0,W1,W2,W3,16
rounds e, f, g, h, a, b, c, d, 4
message_schedule W1,W2,W3,W0,20
rounds a, b, c, d, e, f, g, h, 8
message_schedule W2,W3,W0,W1,24
rounds e, f, g, h, a, b, c, d, 12
message_schedule W3,W0,W1,W2,28
rounds a, b, c, d, e, f, g, h, 16
message_schedule W0,W1,W2,W3,32
rounds e, f, g, h, a, b, c, d, 20
message_schedule W1,W2,W3,W0,36
rounds a, b, c, d, e, f, g, h, 24
message_schedule W2,W3,W0,W1,40
rounds e, f, g, h, a, b, c, d, 28
message_schedule W3,W0,W1,W2,44
rounds a, b, c, d, e, f, g, h, 32
message_schedule W0,W1,W2,W3,48
rounds e, f, g, h, a, b, c, d, 36
message_schedule W1,W2,W3,W0,52
rounds a, b, c, d, e, f, g, h, 40
message_schedule W2,W3,W0,W1,56
rounds e, f, g, h, a, b, c, d, 44
message_schedule W3,W0,W1,W2,60
// revert K to the beginning of K256[]
subl $256, K
subl $1, num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
rounds a, b, c, d, e, f, g, h, 48
update_W_WK 0, W0
rounds e, f, g, h, a, b, c, d, 52
update_W_WK 1, W1
rounds a, b, c, d, e, f, g, h, 56
update_W_WK 2, W2
rounds e, f, g, h, a, b, c, d, 60
update_W_WK 3, W3
addl $64, K
addl $64, data_addr
// ctx->states += digests a-h
mov ctx_addr, t
add a, 0*4(t)
add b, 1*4(t)
mov c, s
add s, 2*4(t)
add d, 3*4(t)
add e, 4*4(t)
mov f, s
add s, 5*4(t)
add g, 6*4(t)
mov h, s
add s, 7*4(t)
jmp L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds a, b, c, d, e, f, g, h, 48
rounds e, f, g, h, a, b, c, d, 52
rounds a, b, c, d, e, f, g, h, 56
rounds e, f, g, h, a, b, c, d, 60
// ctx->states += digests a-h
mov ctx_addr, t
add a, 0*4(t)
add b, 1*4(t)
mov c, s
add s, 2*4(t)
add d, 3*4(t)
add e, 4*4(t)
mov f, s
add s, 5*4(t)
add g, 6*4(t)
mov h, s
add s, 7*4(t)
// if kernel, restore xmm0-xmm7
#if BUILDKERNEL
movdqa 0*16+xmm_save, %xmm0
movdqa 1*16+xmm_save, %xmm1
movdqa 2*16+xmm_save, %xmm2
movdqa 3*16+xmm_save, %xmm3
movdqa 4*16+xmm_save, %xmm4
movdqa 5*16+xmm_save, %xmm5
movdqa 6*16+xmm_save, %xmm6
movdqa 7*16+xmm_save, %xmm7
#endif
// free allocated stack memory
add $stack_size, sp
// restore callee-saved registers
pop %edi
pop %esi
pop %ebx
pop %ebp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 4, 0x90
L_bswap:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // i386

File diff suppressed because it is too large


@ -0,0 +1,564 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides armv7 neon hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in q8-q15
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in NEON registers (d0-d7)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
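/*
 For reference, a minimal C sketch of the round and schedule update described above
 (illustrative only: the helper names below are local to this comment and are not
 symbols defined in this file):

     #include <stdint.h>

     static inline uint64_t ror64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

     #define Ch(x,y,z)   (((x) & (y)) ^ (~(x) & (z)))
     #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
     #define Sigma0(x)   (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))
     #define Sigma1(x)   (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))
     #define Gamma0(x)   (ror64((x),  1) ^ ror64((x),  8) ^ ((x) >> 7))
     #define Gamma1(x)   (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))

     // schedule update for round r >= 16, with W[] kept as a 16-entry circular buffer
     W[r & 15] += Gamma1(W[(r - 2) & 15]) + W[(r - 7) & 15] + Gamma0(W[(r - 15) & 15]);

     // per-round digest update; afterwards the roles rotate: (a..h) <- (h,a,b,c,d,e,f,g)
     uint64_t T1 = h + Sigma1(e) + Ch(e, f, g) + K[r] + W[r & 15];
     d += T1;
     h  = T1 + Sigma0(a) + Maj(a, b, c);
*/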
#if (defined(__arm__) && defined(__ARM_NEON__))
// associate variables with registers or memory
#define stack_size (16*8)
#define ctx r0
#define num_blocks r1
#define data r2
/* use d0-d7 (q0-q3) for 8 digests */
#define a d0
#define b d1
#define c d2
#define d d3
#define e d4
#define f d5
#define g d6
#define h d7
#define K r3
// 3 local variables
#define s d8
#define t d9
#define u d10
// a window (16 quad-words) of message schedule
#define W0 q8
#define W1 q9
#define W2 q10
#define W3 q11
#define W4 q12
#define W5 q13
#define W6 q14
#define W7 q15
// circular buffer for WK[(r:r+15)%16]
#define WK(x) [sp,#((x)&15)*8]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
/* t = Ch($0, $1, $2) */
.macro Ch
veor t, $1, $2
vand t, t, $0
veor t, t, $2
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/* t = Maj($0, $1, $2) */
.macro Maj
veor t, $1, $2 // y^z
vand s, $1,$2 // y&z
vand t, t, $0 // x&(y^z)
veor t, t, s // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on two words in a vector register
// use q6/q7 as intermediate registers
.macro Gamma0
vshr.u64 q6, $0, #1 // part of S64(1, x)
vshl.i64 q7, $0, #56 // part of S64(8, x)
vshr.u64 $0, $0, #7 // R(7, x)
veor $0, $0, q6
vshr.u64 q6, q6, #7 // part of S64(8, x)
veor $0, $0, q7
vshl.i64 q7, q7, #7 // part of S64(1, x)
veor $0, $0, q6
veor $0, $0, q7
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on two words in a vector register
// use q6/q7 as intermediate registers
.macro Gamma1
vshr.u64 q6, $0, #19 // part of S64(19, x)
vshl.i64 q7, $0, #3 // part of S64(61, x)
vshr.u64 $0, $0, #6 // R(6, x)
veor $0, $0, q6
vshr.u64 q6, q6, #42 // part of S64(61, x)
veor $0, $0, q7
vshl.i64 q7, q7, #42 // part of S64(19, x)
veor $0, $0, q6
veor $0, $0, q7
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use q5-q7 for temp
*/
.macro message_update2
vext.64 q7, $4, $5, #1 // W[r-7]
vext.64 q5, $0, $1, #1 // W[r-15]
vadd.s64 $0, $0, q7 // W[r-16] + W[r-7];
Gamma0 q5
vadd.s64 $0, $0, q5 // W[r-16] + W[r-7] + Gamma0(W[r-15])
vshr.u64 q6, $7, #19 // Gamma1(W[r-2]), part of S64(19, x)
vshl.i64 q7, $7, #3 // part of S64(61, x)
vshr.u64 q5, $7, #6 // R(6, x)
veor q5, q5, q6
vshr.u64 q6, q6, #42 // part of S64(61, x)
veor q5, q5, q7
vshl.i64 q7, q7, #42 // part of S64(19, x)
veor q5, q5, q6
veor q5, q5, q7
vadd.s64 $0, $0, q5 // W[r-16] + W[r-7] + Gamma1(W7)
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0
vshr.u64 t, $0, #28
vshl.i64 s, $0, #25
vshr.u64 u, t, #6
veor t, t, s
vshl.i64 s, s, #5
veor t, t, u
vshr.u64 u, u, #5
veor t, t, s
vshl.i64 s, s, #6
veor t, t, u
veor t, t, s
.endm
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.macro Sigma1
vshr.u64 t, $0, #14
vshl.i64 s, $0, #23
vshr.u64 u, t, #4
veor t, t, s
vshl.i64 s, s, #23
veor t, t, u
vshr.u64 u, u, #23
veor t, t, s
vshl.i64 s, s, #4
veor t, t, u
veor t, t, s
.endm
// per round digests update
.macro round_ref
Sigma1 $4 // t = Sigma1(e);
vadd.s64 $7, $7, t // h = h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g);
vldr s, WK($8) // s = WK
vadd.s64 $7, $7, t // h = h+Sigma1(e)+Ch(e,f,g);
vadd.s64 $7, $7, s // h = h+Sigma1(e)+Ch(e,f,g)+WK
vadd.s64 $3, $3, $7 // d += h;
Sigma0 $0 // t = Sigma0(a);
vadd.s64 $7, $7, t // h += Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
vadd.s64 $7, $7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro round
Sigma1 $4 // t = Sigma1(e);
vldr s, WK($8) // s = WK
vadd.s64 $7, $7, t // h = h+Sigma1(e)
veor t, $5, $6
vadd.s64 $7, $7, s // h = h+Sigma1(e)+WK
vand t, t, $4
veor t, t, $6 // t = Ch (e, f, g);
vadd.s64 $7, $7, t // h = h+Sigma1(e)+Ch(e,f,g);
Sigma0 $0 // t = Sigma0(a);
vadd.s64 $3, $3, $7 // d += h;
vadd.s64 $7, $7, t // h += Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
vadd.s64 $7, $7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
/*
16 rounds of hash update, update input schedule W (in vector register v0-v7) and WK = W + K (in stack)
*/
.macro rounds_schedule
mov r12, sp
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W0
vst1.64 {q7}, [r12]!
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W1
vst1.64 {q7}, [r12]!
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W2
vst1.64 {q7}, [r12]!
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W3
vst1.64 {q7}, [r12]!
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W4
vst1.64 {q7}, [r12]!
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W5
vst1.64 {q7}, [r12]!
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W6
vst1.64 {q7}, [r12]!
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W7
vst1.64 {q7}, [r12]!
.endm
.macro rev64
vrev64.8 $0, $0
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register v0-v7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial
mov r12, sp
vld1.8 {W0}, [data]!
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
rev64 W0
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W0
vst1.64 {q7}, [r12]!
vld1.8 {W1}, [data]!
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
rev64 W1
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W1
vst1.64 {q7}, [r12]!
vld1.8 {W2}, [data]!
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
rev64 W2
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W2
vst1.64 {q7}, [r12]!
vld1.8 {W3}, [data]!
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
rev64 W3
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W3
vst1.64 {q7}, [r12]!
vld1.8 {W4}, [data]!
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
rev64 W4
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W4
vst1.64 {q7}, [r12]!
vld1.8 {W5}, [data]!
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
rev64 W5
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W5
vst1.64 {q7}, [r12]!
vld1.8 {W6}, [data]!
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
rev64 W6
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W6
vst1.64 {q7}, [r12]!
vld1.8 {W7}, [data]!
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
rev64 W7
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
vld1.64 {q7}, [K,:128]!
vadd.s64 q7, q7, W7
vst1.64 {q7}, [r12]!
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
.endm
.p2align 4
L_table1:
.long L_Tab$non_lazy_ptr-(L_table0+8)
.p2align 4
.text
.globl _AccelerateCrypto_SHA512_compress
_AccelerateCrypto_SHA512_compress:
// push callee-saved registers
push {r4,r5,r7,lr}
add r7, sp, #8 // set up dtrace frame pointer
vpush {q4-q7}
#if BUILDKERNEL
vpush {q0-q3}
vpush {q8-q15}
#endif
// allocate stack space for WK[0:15]
sub sp, sp, #stack_size
ldr K, L_table1
L_table0:
mov r12, pc
ldr K, [r12, K]
vld1.8 {W0,W1}, [data]!
vld1.8 {W2,W3}, [data]!
vld1.8 {W4,W5}, [data]!
vld1.8 {W6,W7}, [data]!
rev64 W0
rev64 W1
rev64 W2
rev64 W3
rev64 W4
rev64 W5
rev64 W6
rev64 W7
mov r12, sp
// compute WK[0:15] and save in stack, use q0-q7 as they have not yet been used
vld1.8 {q0,q1}, [K,:128]!
vld1.8 {q2,q3}, [K,:128]!
vld1.8 {q4,q5}, [K,:128]!
vld1.8 {q6,q7}, [K,:128]!
vadd.s64 q0, q0, W0
vadd.s64 q1, q1, W1
vadd.s64 q2, q2, W2
vadd.s64 q3, q3, W3
vadd.s64 q4, q4, W4
vadd.s64 q5, q5, W5
vadd.s64 q6, q6, W6
vadd.s64 q7, q7, W7
vst1.32 {q0,q1}, [r12]!
vst1.32 {q2,q3}, [r12]!
vst1.32 {q4,q5}, [r12]!
vst1.32 {q6,q7}, [r12]!
L_loop:
// digests a-h = ctx->states;
mov r12, ctx
vld1.64 {q0,q1}, [r12]!
vld1.64 {q2,q3}, [r12]
// rounds 0:63 interleaved with W/WK update for rounds 16:79
mov r4, #4
L_i_loop:
rounds_schedule a, b, c, d, e, f, g, h, 16
subs r4, r4, #1
bgt L_i_loop
// revert K to the beginning of K512[]
sub K, K, #640
subs num_blocks, num_blocks, #1 // num_blocks--
beq L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
mov r12, ctx
vld1.64 {q4,q5}, [r12]!
vld1.64 {q6,q7}, [r12]
vadd.s64 q4, q0, q4
vadd.s64 q5, q1, q5
vadd.s64 q6, q2, q6
vadd.s64 q7, q3, q7
vst1.64 {q4,q5}, [ctx]
vst1.64 {q6,q7}, [r12]
bal L_loop // branch for next block
// wrap up digest update rounds 64:79 for final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
mov r12, ctx
vld1.64 {q4,q5}, [r12]!
vld1.64 {q6,q7}, [r12]
vadd.s64 q4, q0, q4
vadd.s64 q5, q1, q5
vadd.s64 q6, q2, q6
vadd.s64 q7, q3, q7
vst1.64 {q4,q5}, [ctx]
vst1.64 {q6,q7}, [r12]
// free allocated stack memory
add sp, sp, #stack_size
// if kernel, restore used vector registers
#if BUILDKERNEL
vpop {q8-q15}
vpop {q0-q3}
#endif
vpop {q4-q7}
// return
pop {r4,r5,r7,pc}
.section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
.p2align 4
L_Tab$non_lazy_ptr:
.indirect_symbol _sha512_K
.long 0
#endif // (defined(__arm__) && defined(__ARM_NEON__))


@ -0,0 +1,622 @@
# Copyright (c) (2016,2018-2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides arm64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in v0-v7
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPRs (x4-x11)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __arm64__
#include "ccarm_pac_bti_macros.h"
// associate variables with registers or memory
#define stack_size (16*8)
#define ctx x0
#define num_blocks x1
#define data x2
#define a x4
#define bb x5
#define c x6
#define d x7
#define e x8
#define f x9
#define g x10
#define h x11
#define K x3
// 3 local variables
#define s x12
#define t x13
#define u x14
// a window (16 quad-words) of message schedule
#define W0 v0
#define W1 v1
#define W2 v2
#define W3 v3
#define W4 v4
#define W5 v5
#define W6 v6
#define W7 v7
// circular buffer for WK[(r:r+15)%16]
#define WK(x) [sp,#((x)&15)*8]
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
/* t = Ch($0, $1, $2) */
.macro Ch
eor t, $1, $2
and t, t, $0
eor t, t, $2
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/* t = Maj($0, $1, $2) */
.macro Maj
eor t, $1, $2 // y^z
and s, $1,$2 // y&z
and t, t, $0 // x&(y^z)
eor t, t, s // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on two words in a vector register
// use v20/v21 as intermediate registers
.macro Gamma0
ushr.2d v20, $0, #1 // part of S64(1, x)
shl.2d v21, $0, #56 // part of S64(8, x)
ushr.2d $0, $0, #7 // R(7, x)
eor.16b $0, $0, v20
ushr.2d v20, v20, #7 // part of S64(8, x)
eor.16b $0, $0, v21
shl.2d v21,v21, #7 // part of S64(1, x)
eor.16b $0, $0, v20
eor.16b $0, $0, v21
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on two words in a vector register
// use v16/v17 as intermediate registers
.macro Gamma1
ushr.2d v16, $0, #19 // part of S64(19, x)
shl.2d v17, $0, #3 // part of S64(61, x)
ushr.2d $0, $0, #6 // R(6, x)
eor.16b $0, $0, v16
ushr.2d v16, v16, #42 // part of S64(61, x)
eor.16b $0, $0, v17
shl.2d v17,v17, #42 // part of S64(19, x)
eor.16b $0, $0, v16
eor.16b $0, $0, v17
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use v16-v19 for temp
*/
.macro message_update2 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7
ext.16b v18, \vec4, \vec5, #8 // vext(W4,W5)
ext.16b v19, \vec0, \vec1, #8 // vext(W0,W1)
ushr.2d v16, \vec7, #19 // part of S64(19, x)
shl.2d v17, \vec7, #3 // part of S64(61, x)
add.2d \vec0, \vec0, v18 // W0 + vext(W4,W5)
ushr.2d v18, \vec7, #6 // R(6,x)
ushr.2d v20, v19, #1 // part of S64(1, x)
shl.2d v21, v19, #56 // part of S64(8, x)
ushr.2d v19, v19, #7 // R(7, x)
eor.16b v18, v18, v16
ushr.2d v16, v16, #42 // part of S64(61, x)
eor.16b v19, v19, v20
ushr.2d v20, v20, #7 // part of S64(8, x)
eor.16b v18, v18, v17
shl.2d v17, v17, #42 // part of S64(19, x)
eor.16b v19, v19, v21
shl.2d v21,v21, #7 // part of S64(1, x)
eor.16b v18, v18, v16
eor.16b v19, v19, v20
eor.16b v18, v18, v17
eor.16b v19, v19, v21
add.2d \vec0, \vec0, v18 // W0 + Gamma1(W7) + vext(W4,W5)
add.2d \vec0, \vec0, v19 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0
ror t, $0, #28
eor t, t, $0, ror #34
eor t, t, $0, ror #39
.endm
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.macro Sigma1
ror t, $0, #14
eor t, t, $0, ror #18
eor t, t, $0, ror #41
.endm
// per round digests update
.macro round_ref
Sigma1 $4 // t = Sigma1(e);
add $7, $7, t // h = h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g);
ldr s, WK($8) // s = WK
add $7, $7, t // h = h+Sigma1(e)+Ch(e,f,g);
add $7, $7, s // h = h+Sigma1(e)+Ch(e,f,g)+WK
add $3, $3, $7 // d += h;
Sigma0 $0 // t = Sigma0(a);
add $7, $7, t // h += Sigma0(a);
Maj $0, $1, $2 // t = Maj(a,b,c)
add $7, $7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro round s0, s1, s2, s3, s4, s5, s6, s7, s8
ror t, \s4, #14
eor s, \s5, \s6
ldr u, WK(\s8) // u = WK
eor t, t, \s4, ror #18
and s, s, \s4
add \s7, \s7, u // h = h+WK
eor t, t, \s4, ror #41
eor s, s, \s6
add \s7, \s7, t // h = h+WK+Sigma1(e)
eor t, \s1, \s2 // y^z
add \s7, \s7, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s0, #28
add \s3, \s3, \s7 // d += h;
and u, \s1,\s2 // y&z
eor s, s, \s0, ror #34
and t, t, \s0 // x&(y^z)
eor s, s, \s0, ror #39
eor t, t, u // Maj(x,y,z)
add \s7, \s7, s // h += Sigma0(a);
add \s7, \s7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro combined_message_round_update2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7
//
// message_update2 \vec0, \vec1, \vec2, \vec3, \vec4, \vec5, \vec6, \vec7
// round \s0, \s1, \s2, \s3, \s4, \s5, \s6, \s7, 0+\s8+\s9
// round \s7, \s0, \s1, \s2, \s3, \s4, \s5, \s6, 1+\s8+\s9
ror t, \s4, #14
ldr u, WK(0+\s8+\s9) // u = WK
eor s, \s5, \s6
ext.16b v18, \vec4, \vec5, #8 // vext(W4,W5)
eor t, t, \s4, ror #18
and s, s, \s4
ext.16b v19, \vec0, \vec1, #8 // vext(W0,W1)
add \s7, \s7, u // h = h+WK
eor t, t, \s4, ror #41
ushr.2d v16, \vec7, #19 // part of S64(19, x)
eor s, s, \s6
add \s7, \s7, t // h = h+WK+Sigma1(e)
shl.2d v17, \vec7, #3 // part of S64(61, x)
eor t, \s1, \s2 // y^z
add.2d \vec0, \vec0, v18 // W0 + vext(W4,W5)
ushr.2d v18, \vec7, #6 // R(6,x)
add \s7, \s7, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ushr.2d v20, v19, #1 // part of S64(1, x)
ror s, \s0, #28
shl.2d v21, v19, #56 // part of S64(8, x)
add \s3, \s3, \s7 // d += h;
ushr.2d v19, v19, #7 // R(7, x)
and u, \s1,\s2 // y&z
eor.16b v18, v18, v16
eor s, s, \s0, ror #34
ushr.2d v16, v16, #42 // part of S64(61, x)
and t, t, \s0 // x&(y^z)
eor.16b v19, v19, v20
eor s, s, \s0, ror #39
ushr.2d v20, v20, #7 // part of S64(8, x)
eor t, t, u // Maj(x,y,z)
eor.16b v18, v18, v17
add \s7, \s7, s // h += Sigma0(a);
shl.2d v17, v17, #42 // part of S64(19, x)
add \s7, \s7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
eor.16b v19, v19, v21
ror t, \s3, #14
shl.2d v21,v21, #7 // part of S64(1, x)
ldr u, WK(1+\s8+\s9) // u = WK
eor s, \s4, \s5
eor.16b v18, v18, v16
ldr q16, [K]
eor t, t, \s3, ror #18
eor.16b v19, v19, v20
add K, K, #16
eor.16b v18, v18, v17
and s, s, \s3
eor.16b v19, v19, v21
add \s6, \s6, u // h = h+WK
add.2d \vec0, \vec0, v18 // W0 + Gamma1(W7) + vext(W4,W5)
eor t, t, \s3, ror #41
add.2d \vec0, \vec0, v19 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
eor s, s, \s5
add \s6, \s6, t // h = h+WK+Sigma1(e)
eor t, \s0, \s1 // y^z
add.2d v16, v16, \vec0
add \s6, \s6, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s7, #28
add \s2, \s2, \s6 // d += h;
and u, \s0,\s1 // y&z
eor s, s, \s7, ror #34
and t, t, \s7 // x&(y^z)
eor s, s, \s7, ror #39
eor t, t, u // Maj(x,y,z)
add \s6, \s6, s // h += Sigma0(a);
add \s6, \s6, t // h = T1 + Sigma0(a) + Maj(a,b,c);
str q16, WK(\s9)
.endm
/*
16 rounds of hash update, update input schedule W (in vector register v0-v7) and WK = W + K (in stack)
*/
.macro rounds_schedule
combined_message_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 0, W0, W1, W2, W3, W4, W5, W6, W7
combined_message_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8, 2, W1, W2, W3, W4, W5, W6, W7, W0
combined_message_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8, 4, W2, W3, W4, W5, W6, W7, W0, W1
combined_message_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8, 6, W3, W4, W5, W6, W7, W0, W1, W2
combined_message_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 8, W4, W5, W6, W7, W0, W1, W2, W3
combined_message_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8,10, W5, W6, W7, W0, W1, W2, W3, W4
combined_message_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8,12, W6, W7, W0, W1, W2, W3, W4, W5
combined_message_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8,14, W7, W0, W1, W2, W3, W4, W5, W6
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register v0-v7) and update WK = W + K (in stack)
*/
.macro combined_initial_round_update2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, vec0
ror t, \s4, #14
ldr u, WK(0+\s8+\s9) // u = WK
eor s, \s5, \s6
ld1.16b {\vec0}, [data], #16
eor t, t, \s4, ror #18
and s, s, \s4
add \s7, \s7, u // h = h+WK
eor t, t, \s4, ror #41
eor s, s, \s6
add \s7, \s7, t // h = h+WK+Sigma1(e)
eor t, \s1, \s2 // y^z
add \s7, \s7, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s0, #28
ld1.2d {v16}, [K], #16
add \s3, \s3, \s7 // d += h;
and u, \s1,\s2 // y&z
eor s, s, \s0, ror #34
and t, t, \s0 // x&(y^z)
eor s, s, \s0, ror #39
eor t, t, u // Maj(x,y,z)
add \s7, \s7, s // h += Sigma0(a);
add \s7, \s7, t // h = T1 + Sigma0(a) + Maj(a,b,c);
ror t, \s3, #14
eor s, \s4, \s5
ldr u, WK(1+\s8+\s9) // u = WK
eor t, t, \s3, ror #18
and s, s, \s3
add \s6, \s6, u // h = h+WK
rev64.16b \vec0, \vec0
eor t, t, \s3, ror #41
eor s, s, \s5
add \s6, \s6, t // h = h+WK+Sigma1(e)
eor t, \s0, \s1 // y^z
add \s6, \s6, s // h = h+WK+Sigma1(e)+Ch(e,f,g);
ror s, \s7, #28
add.2d v16, v16, \vec0
add \s2, \s2, \s6 // d += h;
and u, \s0,\s1 // y&z
eor s, s, \s7, ror #34
and t, t, \s7 // x&(y^z)
eor s, s, \s7, ror #39
eor t, t, u // Maj(x,y,z)
add \s6, \s6, s // h += Sigma0(a);
str q16, WK(\s9)
add \s6, \s6, t // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro rounds_schedule_initial
combined_initial_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 0, W0
combined_initial_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8, 2, W1
combined_initial_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8, 4, W2
combined_initial_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8, 6, W3
combined_initial_round_update2 $0, $1, $2, $3, $4, $5, $6, $7, $8, 8, W4
combined_initial_round_update2 $6, $7, $0, $1, $2, $3, $4, $5, $8,10, W5
combined_initial_round_update2 $4, $5, $6, $7, $0, $1, $2, $3, $8,12, W6
combined_initial_round_update2 $2, $3, $4, $5, $6, $7, $0, $1, $8,14, W7
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 4+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 5+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 6+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 7+$8
round $0, $1, $2, $3, $4, $5, $6, $7, 8+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 9+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 10+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 11+$8
round $4, $5, $6, $7, $0, $1, $2, $3, 12+$8
round $3, $4, $5, $6, $7, $0, $1, $2, 13+$8
round $2, $3, $4, $5, $6, $7, $0, $1, 14+$8
round $1, $2, $3, $4, $5, $6, $7, $0, 15+$8
.endm
.subsections_via_symbols
.text
.p2align 4
.globl _AccelerateCrypto_SHA512_compress
_AccelerateCrypto_SHA512_compress:
BRANCH_TARGET_CALL
#ifdef __ILP32__
uxtw num_blocks, num_blocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
adrp K, _sha512_K@page
cbnz num_blocks, 1f // if the number of blocks is nonzero, go on with the sha512 transform
ret lr // otherwise, return
1:
add K, K, _sha512_K@pageoff
#if BUILDKERNEL
// v0-v7, v16-v23
sub x4, sp, #16*16
sub sp, sp, #16*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20, v21, v22, v23}, [x4], #64
#endif
// allocate stack space for WK[0:15]
sub sp, sp, #stack_size
ldr q0, [data], #128
ldr q1, [data, #-112]
ldr q2, [data, #-96]
ldr q3, [data, #-80]
rev64.16b v0, v0
ldr q4, [data, #-64]
rev64.16b v1, v1
ldr q5, [data, #-48]
rev64.16b v2, v2
ldr q6, [data, #-32]
rev64.16b v3, v3
ldr q7, [data, #-16]
rev64.16b v4, v4
ldr q16, [K], #64
rev64.16b v5, v5
ldr q17, [K, #-48]
rev64.16b v6, v6
ldr q18, [K, #-32]
rev64.16b v7, v7
ldr q19, [K, #-16]
// compute WK[0:15] and save in stack
add.2d v20, v16, v0
ldr q16, [K], #64
add.2d v21, v17, v1
ldr q17, [K, #-48]
add.2d v22, v18, v2
ldr q18, [K, #-32]
add.2d v23, v19, v3
ldr q19, [K, #-16]
add.2d v16, v16, v4
str q20, [sp]
add.2d v17, v17, v5
str q21, [sp, #16*1]
add.2d v18, v18, v6
str q22, [sp, #16*2]
add.2d v19, v19, v7
str q23, [sp, #16*3]
str q16, [sp, #16*4]
str q17, [sp, #16*5]
str q18, [sp, #16*6]
str q19, [sp, #16*7]
L_loop:
// digests a-h = ctx->states;
ldp a, bb, [ctx]
ldp c, d, [ctx, #16]
ldp e, f, [ctx, #32]
ldp g, h, [ctx, #48]
// rounds 0:63 interleaved with W/WK update for rounds 16:79
mov w15, #4
L_i_loop:
rounds_schedule a, bb, c, d, e, f, g, h, 16
subs w15, w15, #1
b.gt L_i_loop
// revert K to the beginning of K512[]
sub K, K, #640
subs num_blocks, num_blocks, #1 // num_blocks--
b.eq L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, bb, c, d, e, f, g, h, 0
// ctx->states += digests a-h
ldp s, t, [ctx]
add s, s, a
add t, t, bb
stp s, t, [ctx]
ldp s, t, [ctx, #16]
add s, s, c
add t, t, d
stp s, t, [ctx, #16]
ldp s, t, [ctx, #32]
add s, s, e
add t, t, f
stp s, t, [ctx, #32]
ldp s, t, [ctx, #48]
add s, s, g
add t, t, h
stp s, t, [ctx, #48]
b L_loop // branch for next block
// wrap up digest update rounds 64:79 for final block
L_final_block:
rounds_schedule_final a, bb, c, d, e, f, g, h, 0
// ctx->states += digests a-h
ldp s, t, [ctx]
add s, s, a
add t, t, bb
stp s, t, [ctx]
ldp s, t, [ctx, #16]
add s, s, c
add t, t, d
stp s, t, [ctx, #16]
ldp s, t, [ctx, #32]
add s, s, e
add t, t, f
stp s, t, [ctx, #32]
ldp s, t, [ctx, #48]
add s, s, g
add t, t, h
stp s, t, [ctx, #48]
// if kernel, restore used vector registers
#if BUILDKERNEL
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20, v21, v22, v23}, [sp], #64
#endif
// free allocated stack memory
add sp, sp, #stack_size
// return
ret lr
#endif // __arm64__


@ -0,0 +1,259 @@
# Copyright (c) (2016,2018,2019,2020) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
/*
This file provides arm64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in v0-v7
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) are kept as 64-bit pairs in v8-v11, with per-block working copies in v24-v27
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into v0:v7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __arm64__
#include "ccarm_pac_bti_macros.h"
.macro swap_hilo
ext.16b $0, $0, $0, #8
.endm
.macro ext16b
ext.16b $0, $1, $2, #8
.endm
.text
.align 4
.globl _AccelerateCrypto_SHA512_compress_hwassist
_AccelerateCrypto_SHA512_compress_hwassist:
BRANCH_TARGET_CALL
#define hashes x0
#define numblocks x1
#define data x2
#define ktable x3
#ifdef __ILP32__
uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
#endif
adrp ktable, _ccsha512_K@page
cbnz numblocks, 1f
ret lr // otherwise, return
1:
add ktable, ktable, _ccsha512_K@pageoff
#if BUILDKERNEL
sub x4, sp, #28*16
sub sp, sp, #28*16
st1.4s {v0, v1, v2, v3}, [x4], #64
st1.4s {v4, v5, v6, v7}, [x4], #64
st1.4s {v16, v17, v18, v19}, [x4], #64
st1.4s {v20, v21, v22, v23}, [x4], #64
st1.4s {v24, v25, v26, v27}, [x4], #64
st1.4s {v28, v29, v30, v31}, [x4], #64
#else
sub x4, sp, #4*16
sub sp, sp, #4*16
#endif
st1.4s {v8, v9, v10, v11}, [x4], #64
ld1.2d {v8,v9,v10,v11}, [hashes] // (a,b) (c,d) (e,f) (g,h)
L_loop:
mov.16b v24, v8
ldr q0, [data, #0*16]
mov.16b v25, v9
ldr q1, [data, #1*16]
mov.16b v26, v10
ldr q2, [data, #2*16]
mov.16b v27, v11
ldr q3, [data, #3*16]
rev64.16b v0, v0
ldr q4, [data, #4*16]
rev64.16b v1, v1
ldr q5, [data, #5*16]
rev64.16b v2, v2
ldr q6, [data, #6*16]
rev64.16b v3, v3
ldr q7, [data, #7*16]
rev64.16b v4, v4
ldr q16, [ktable, #0*16]
rev64.16b v5, v5
ldr q17, [ktable, #1*16]
rev64.16b v6, v6
ldr q18, [ktable, #2*16]
rev64.16b v7, v7
ldr q19, [ktable, #3*16]
add.2d v16, v16, v0
ldr q20, [ktable, #4*16]
add.2d v17, v17, v1
ldr q21, [ktable, #5*16]
add.2d v18, v18, v2
ldr q22, [ktable, #6*16]
add.2d v19, v19, v3
ldr q23, [ktable, #7*16]
add.2d v20, v20, v4
add data, data, #8*16
add.2d v21, v21, v5
add ktable, ktable, #8*16
add.2d v22, v22, v6
add.2d v23, v23, v7
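// The round macros below rely on the ARMv8.2 SHA-512 extension ("hwassist"):
// sha512h/sha512h2 perform the two halves of the two-round digest update, and
// sha512su0/sha512su1 perform the corresponding message-schedule update.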
.macro sha512_round S0, S1, S2, S3, WK, w0, w1, w4, w5, w7, i
ext16b \WK, \WK, \WK
ext16b v29, \S2, \S3
ext16b v28, \S1, \S2
add.2d \S3, \S3, \WK
ext16b v31, \w4, \w5
ldr q30, [ktable, #\i*16]
sha512h.2d \S3, v29, v28
sha512su0.2d \w0, \w1
mov.16b v28, \S3
sha512h2.2d \S3, \S1, \S0
sha512su1.2d \w0, \w7, v31
add.2d \S1, \S1, v28
add.2d \WK, \w0, v30
.endm
.macro sha512_8_rounds
sha512_round v24, v25, v26, v27, v16, v0, v1, v4, v5, v7, 0
sha512_round v27, v24, v25, v26, v17, v1, v2, v5, v6, v0, 1
sha512_round v26, v27, v24, v25, v18, v2, v3, v6, v7, v1, 2
sha512_round v25, v26, v27, v24, v19, v3, v4, v7, v0, v2, 3
sha512_round v24, v25, v26, v27, v20, v4, v5, v0, v1, v3, 4
sha512_round v27, v24, v25, v26, v21, v5, v6, v1, v2, v4, 5
sha512_round v26, v27, v24, v25, v22, v6, v7, v2, v3, v5, 6
sha512_round v25, v26, v27, v24, v23, v7, v0, v3, v4, v6, 7
add ktable, ktable, #16*8
.endm
.macro sha512_round_final S0, S1, S2, S3, WK, w0, w1, w4, w5, w7
ext16b \WK, \WK, \WK
ext16b v29, \S2, \S3
ext16b v28, \S1, \S2
add.2d v30, \S3, \WK
sha512h.2d v30, v29, v28
mov.16b \S3, v30
sha512h2.2d \S3, \S1, \S0
add.2d \S1, \S1, v30
.endm
.macro sha512_8_rounds_final
sha512_round_final v24, v25, v26, v27, v16
sha512_round_final v27, v24, v25, v26, v17
sha512_round_final v26, v27, v24, v25, v18
sha512_round_final v25, v26, v27, v24, v19
sha512_round_final v24, v25, v26, v27, v20
sha512_round_final v27, v24, v25, v26, v21
sha512_round_final v26, v27, v24, v25, v22
sha512_round_final v25, v26, v27, v24, v23
.endm
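// each sha512_round/sha512_round_final step covers two SHA-512 rounds, so the four
// sha512_8_rounds groups plus the final group below cover all 80 rounds of one block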
sha512_8_rounds
sha512_8_rounds
sha512_8_rounds
sha512_8_rounds
sha512_8_rounds_final
add.2d v8, v8, v24
add.2d v9, v9, v25
add.2d v10, v10, v26
add.2d v11, v11, v27
subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
sub ktable, ktable, #640
b.gt L_loop
st1.2d {v8,v9,v10,v11}, [hashes]
#if BUILDKERNEL
ld1.4s {v0, v1, v2, v3}, [sp], #64
ld1.4s {v4, v5, v6, v7}, [sp], #64
ld1.4s {v16, v17, v18, v19}, [sp], #64
ld1.4s {v20, v21, v22, v23}, [sp], #64
ld1.4s {v24, v25, v26, v27}, [sp], #64
ld1.4s {v28, v29, v30, v31}, [sp], #64
#endif
ld1.4s {v8, v9, v10, v11}, [sp], #64
ret lr
#endif


@ -0,0 +1,29 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stddef.h>
#include "config.h"
#include "AccelerateCrypto.h"
#if defined(__x86_64__)
extern void AccelerateCrypto_SHA512_compress_ssse3(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_ssse3");
extern void AccelerateCrypto_SHA512_compress_AVX1(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_AVX1");
extern void AccelerateCrypto_SHA512_compress_AVX2(uint64_t *state, size_t num, const void *buf) __asm__("_AccelerateCrypto_SHA512_compress_AVX2");
void AccelerateCrypto_SHA512_compress(uint64_t *state, size_t num, const void *buf)
{
if (HAS_AVX2()) AccelerateCrypto_SHA512_compress_AVX2(state, num, buf);
else if (HAS_AVX1()) AccelerateCrypto_SHA512_compress_AVX1(state, num, buf);
else
AccelerateCrypto_SHA512_compress_ssse3(state, num, buf);
}
#endif // defined(__x86_64__)
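/*
 Usage sketch (illustrative; assumes the caller has already produced whole padded
 128-byte blocks -- this routine only runs the compression function):

     #include <stdint.h>

     uint64_t state[8] = {   // standard SHA-512 initial values (FIPS 180-4)
         0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
         0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
         0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
         0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
     };
     uint8_t blocks[2 * 128];  // two padded 128-byte message blocks, filled by the caller
     AccelerateCrypto_SHA512_compress(state, 2, blocks);  // num = number of 128-byte blocks
*/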


@ -0,0 +1,616 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides x86_64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx1/avx2)
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __x86_64__
// associate variables with registers or memory
#define sp %rsp
#define ctx %rdi
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
#define data %rdx
#define a %r8
#define b %r9
#define c %r10
#define d %r11
#define e %r12
#define f %r13
#define g %r14
#define h %r15
#define K %rbx
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define stack_size (8+32*12+128+16) // 8 (_num_blocks) + ymm0:ymm11 + WK(0:15) + 16byte for 32-byte alignment
#define L_aligned_bswap L_bswap(%rip) // bswap : big-endian loading of 8-byte words
#define ymm_save 128(sp) // starting address for ymm save/restore
// 3 local variables
#define s %rax
#define t %rcx
#define u %rsi
// a window (16 quad-words) of message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
#define W4 %xmm4
#define W5 %xmm5
#define W6 %xmm6
#define W7 %xmm7
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*8(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch arg0, arg1, arg2
#if 1
mov \arg2, t // z
xor \arg1, t // y^z
and \arg0, t // x&(y^z)
xor \arg2, t // t = (x&(y^z))^z = Ch(x,y,z)
#else
mov \arg0, t // x
mov \arg0, s // x
not t // ~x
and \arg1, s // x & y
and \arg2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
#endif
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj arg0, arg1, arg2
mov \arg1, t // y
mov \arg2, s // z
xor \arg2, t // y^z
and \arg1, s // y&z
and \arg0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on two words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma0 arg0
vpsrlq $1, \arg0, %xmm8 // part of S64(1, x)
vpsllq $56, \arg0, %xmm9 // part of S64(8, x)
vpsrlq $7, \arg0, \arg0 // R(7, x)
vpxor %xmm8, \arg0, \arg0
vpsrlq $7, %xmm8, %xmm8 // part of S64(8, x)
vpxor %xmm9, \arg0, \arg0
vpsllq $7, %xmm9, %xmm9 // part of S64(1, x)
vpxor %xmm8, \arg0, \arg0
vpxor %xmm9, \arg0, \arg0
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on two words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma1 arg0
vpsrlq $19, \arg0, %xmm8 // part of S64(19, x)
vpsllq $3, \arg0, %xmm9 // part of S64(61, x)
vpsrlq $6, \arg0, \arg0 // R(6, x)
vpxor %xmm8, \arg0, \arg0
vpsrlq $42, %xmm8, %xmm8 // part of S64(61, x)
vpxor %xmm9, \arg0, \arg0
vpsllq $42, %xmm9, %xmm9 // part of S64(19, x)
vpxor %xmm8, \arg0, \arg0
vpxor %xmm9, \arg0, \arg0
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use %xmm10, %xmm11 for temp
*/
.macro message_update2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
vpalignr $8, \arg4, \arg5, %xmm10 // vext(W4,W5)
vpalignr $8, \arg0, \arg1, %xmm11 // vext(W0,W1)
vpaddq %xmm10, \arg0, \arg0 // W0 + vext(W4,W5)
// vmovdqa \arg7, %xmm10
// Gamma1 %xmm10 // Gamma1(W7)
vpsrlq $19, \arg7, %xmm8 // part of S64(19, x)
vpsllq $3, \arg7, %xmm9 // part of S64(61, x)
vpsrlq $6, \arg7, %xmm10 // R(6, x)
vpxor %xmm8, %xmm10, %xmm10
vpsrlq $42, %xmm8, %xmm8 // part of S64(61, x)
vpxor %xmm9, %xmm10, %xmm10
vpsllq $42, %xmm9, %xmm9 // part of S64(19, x)
vpxor %xmm8, %xmm10, %xmm10
vpxor %xmm9, %xmm10, %xmm10
Gamma0 %xmm11 // Gamma0(vext(W0,W1))
vpaddq %xmm10, \arg0, \arg0 // W0 + Gamma1(W7) + vext(W4,W5)
vpaddq %xmm11, \arg0, \arg0 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0 arg0
mov \arg0, t // x
mov \arg0, s // x
ror $28, t // S(28, (x))
ror $34, s // S(34, (x))
xor s, t // S(28, (x)) ^ S(34, (x))
ror $5, s // S(39, (x))
xor s, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm
// #define Sigma1(x) (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.macro Sigma1 arg0
mov \arg0, s // x
ror $14, s // S(14, (x))
mov s, t // S(14, (x))
ror $4, s // S(18, (x))
xor s, t // S(14, (x)) ^ S(18, (x))
ror $23, s // S(41, (x))
xor s, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm
// per round digests update
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
Sigma1 \arg4 // t = Sigma1(e);
add t, \arg7 // h = h+Sigma1(e)
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
add t, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), \arg7 // h = h+Sigma1(e)+Ch(e,f,g)+WK
add \arg7, \arg3 // d += h;
Sigma0 \arg0 // t = Sigma0(a);
add t, \arg7 // h += Sigma0(a);
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
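// scheduled variant of round_ref: Sigma1(e) is folded as ror14(e ^ ror4(e ^ ror23(e))),
// Sigma0(a) as ror28(a ^ ror6(a ^ ror5(a))), and Maj(a,b,c) as ((a|c)&b)|(a&c),
// interleaving the scalar work to shorten dependency chains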
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
mov \arg4, s
mov \arg0, t
ror $(41-18), s
ror $(39-34), t
xor \arg4, s
mov \arg5, u
xor \arg0, t
ror $(18-14), s
xor \arg6, u
xor \arg4, s
ror $(34-28), t
and \arg4, u
xor \arg0, t
xor \arg6, u
ror $14, s
ror $28, t
add s, u
mov \arg0, s
add WK(\arg8), u
or \arg2, s
add u, \arg7
mov \arg0, u
add \arg7, \arg3
and \arg1, s
and \arg2, u
or u, s
add t, \arg7
add s, \arg7
.endm
/*
16 rounds of hash update, update input schedule W (in vector register xmm0-xmm7) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
vmovdqa 0*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
vpaddq W0, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vmovdqa %xmm8, WK(0)
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
vmovdqa 1*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
vpaddq W1, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %xmm8, WK(2)
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
vmovdqa 2*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
vpaddq W2, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vmovdqa %xmm8, WK(4)
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
vmovdqa 3*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
vpaddq W3, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %xmm8, WK(6)
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
vmovdqa 4*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
vpaddq W4, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vmovdqa %xmm8, WK(8)
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
vmovdqa 5*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
vpaddq W5, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %xmm8, WK(10)
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
vmovdqa 6*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
vpaddq W6, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vmovdqa %xmm8, WK(12)
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
vmovdqa 7*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
vpaddq W7, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %xmm8, WK(14)
addq $128, K
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vmovdqu 0*16(data), W0
vmovdqa 0*16(K), %xmm8
vpshufb L_aligned_bswap, W0, W0
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
vpaddq W0, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vmovdqa %xmm8, WK(0)
vmovdqu 1*16(data), W1
vmovdqa 1*16(K), %xmm8
vpshufb L_aligned_bswap, W1, W1
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
vpaddq W1, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %xmm8, WK(2)
vmovdqu 2*16(data), W2
vmovdqa 2*16(K), %xmm8
vpshufb L_aligned_bswap, W2, W2
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
vpaddq W2, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vmovdqa %xmm8, WK(4)
vmovdqu 3*16(data), W3
vmovdqa 3*16(K), %xmm8
vpshufb L_aligned_bswap, W3, W3
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
vpaddq W3, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %xmm8, WK(6)
vmovdqu 4*16(data), W4
vmovdqa 4*16(K), %xmm8
vpshufb L_aligned_bswap, W4, W4
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
vpaddq W4, %xmm8, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vmovdqa %xmm8, WK(8)
vmovdqu 5*16(data), W5
vmovdqa 5*16(K), %xmm8
vpshufb L_aligned_bswap, W5, W5
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
vpaddq W5, %xmm8, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %xmm8, WK(10)
vmovdqu 6*16(data), W6
vmovdqa 6*16(K), %xmm8
vpshufb L_aligned_bswap, W6, W6
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
vpaddq W6, %xmm8, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vmovdqa %xmm8, WK(12)
vmovdqu 7*16(data), W7
vmovdqa 7*16(K), %xmm8
vpshufb L_aligned_bswap, W7, W7
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
vpaddq W7, %xmm8, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %xmm8, WK(14)
addq $128, K
addq $128, data
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
.text
.globl _AccelerateCrypto_SHA512_compress_AVX1
_AccelerateCrypto_SHA512_compress_AVX1:
// push callee-saved registers
push %rbp
movq %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack space
sub $stack_size, sp
andq $-32, sp // align sp to 32 bytes
// if kernel code, save used ymm registers
#if BUILDKERNEL
vmovdqa %ymm0, 0*32+ymm_save
vmovdqa %ymm1, 1*32+ymm_save
vmovdqa %ymm2, 2*32+ymm_save
vmovdqa %ymm3, 3*32+ymm_save
vmovdqa %ymm4, 4*32+ymm_save
vmovdqa %ymm5, 5*32+ymm_save
vmovdqa %ymm6, 6*32+ymm_save
vmovdqa %ymm7, 7*32+ymm_save
vmovdqa %ymm8, 8*32+ymm_save
vmovdqa %ymm9, 9*32+ymm_save
vmovdqa %ymm10, 10*32+ymm_save
vmovdqa %ymm11, 11*32+ymm_save
#endif
movq num_blocks, _num_blocks
// set up bswap parameters in the aligned stack space and pointer to table K512[]
lea CC_C_LABEL(sha512_K)(%rip), K
// load W[0:15] into xmm0-xmm7
vmovdqu 0*16(data), W0
vmovdqu 1*16(data), W1
vmovdqu 2*16(data), W2
vmovdqu 3*16(data), W3
vmovdqu 4*16(data), W4
vmovdqu 5*16(data), W5
vmovdqu 6*16(data), W6
vmovdqu 7*16(data), W7
addq $128, data
vmovdqa L_aligned_bswap, %xmm8
vpshufb %xmm8, W0, W0
vpshufb %xmm8, W1, W1
vpshufb %xmm8, W2, W2
vpshufb %xmm8, W3, W3
vpshufb %xmm8, W4, W4
vpshufb %xmm8, W5, W5
vpshufb %xmm8, W6, W6
vpshufb %xmm8, W7, W7
// compute WK[0:15] and save in stack
vpaddq 0*16(K), %xmm0, %xmm8
vpaddq 1*16(K), %xmm1, %xmm9
vpaddq 2*16(K), %xmm2, %xmm10
vpaddq 3*16(K), %xmm3, %xmm11
vmovdqa %xmm8, WK(0)
vmovdqa %xmm9, WK(2)
vmovdqa %xmm10, WK(4)
vmovdqa %xmm11, WK(6)
vpaddq 4*16(K), %xmm4, %xmm8
vpaddq 5*16(K), %xmm5, %xmm9
vpaddq 6*16(K), %xmm6, %xmm10
vpaddq 7*16(K), %xmm7, %xmm11
vmovdqa %xmm8, WK(8)
vmovdqa %xmm9, WK(10)
vmovdqa %xmm10, WK(12)
vmovdqa %xmm11, WK(14)
addq $128, K
L_loop:
// digests a-h = ctx->states;
mov 0*8(ctx), a
mov 1*8(ctx), b
mov 2*8(ctx), c
mov 3*8(ctx), d
mov 4*8(ctx), e
mov 5*8(ctx), f
mov 6*8(ctx), g
mov 7*8(ctx), h
// rounds 0:63 interleaved with W/WK update for rounds 16:79
rounds_schedule a, b, c, d, e, f, g, h, 16
rounds_schedule a, b, c, d, e, f, g, h, 32
rounds_schedule a, b, c, d, e, f, g, h, 48
rounds_schedule a, b, c, d, e, f, g, h, 64
// revert K to the beginning of K512[]
subq $640, K
subq $1, _num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
jmp L_loop // branch for next block
// wrap up digest update rounds 64:79 for the final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
// if kernel, restore ymm0-ymm11
#if BUILDKERNEL
vmovdqa 0*32+ymm_save, %ymm0
vmovdqa 1*32+ymm_save, %ymm1
vmovdqa 2*32+ymm_save, %ymm2
vmovdqa 3*32+ymm_save, %ymm3
vmovdqa 4*32+ymm_save, %ymm4
vmovdqa 5*32+ymm_save, %ymm5
vmovdqa 6*32+ymm_save, %ymm6
vmovdqa 7*32+ymm_save, %ymm7
vmovdqa 8*32+ymm_save, %ymm8
vmovdqa 9*32+ymm_save, %ymm9
vmovdqa 10*32+ymm_save, %ymm10
vmovdqa 11*32+ymm_save, %ymm11
#endif
// free allocated stack memory
leaq -40(%rbp), sp
// restore callee-saved registers
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 4
L_bswap:
.quad 0x0001020304050607
.quad 0x08090a0b0c0d0e0f
#endif // x86_64

View File

@ -0,0 +1,552 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides x86_64 avx2 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in ymm0-ymm3 (xmm0-xmm7 in the ssse3/avx1 version, zmm0-zmm1 for avx512)
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
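/*
 For reference, a minimal scalar C sketch of the per-round update described in
 steps 3 and 4 above (illustrative only; rotr64 and sha512_round are not part of
 corecrypto -- the macros below implement the same math with the digests kept in
 GPRs and W/WK kept in vector registers and the stack):

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

    static void sha512_round(uint64_t s[8], uint64_t wk)   // wk = K[r] + W[r]
    {
        uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
        uint64_t Sigma1 = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
        uint64_t Ch     = (e & f) ^ (~e & g);
        uint64_t T1     = h + Sigma1 + Ch + wk;
        uint64_t Sigma0 = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
        uint64_t Maj    = (a & b) ^ (a & c) ^ (b & c);
        // permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + Sigma0 + Maj;
    }
*/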
#if defined __x86_64__
// associate variables with registers or memory
#define sp %rsp
#define ctx %rdi
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
#define data %rdx
#define a %r8
#define b %r9
#define c %r10
#define d %r11
#define e %r12
#define f %r13
#define g %r14
#define h %r15
#define K %rbx
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define L_aligned_bswap L_bswap(%rip)
#define stack_size (8+32*8+128) // 8 (_num_blocks) + ymm save/restore + WK(0:15)
#define ymm_save 128(sp) // starting address for ymm save/restore
// 3 local variables
#define s %rax
#define t %rcx
#define u %rsi
// a window (16 quad-words) of the message schedule
#define W0 %ymm0
#define W1 %ymm1
#define W2 %ymm2
#define W3 %ymm3
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*8(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch arg0, arg1, arg2
#if 1
mov \arg2, t
xor \arg1, t
and \arg0, t
xor \arg2, t
#else
mov \arg0, t // x
mov \arg0, s // x
not t // ~x
and \arg1, s // x & y
and \arg2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
#endif
.endm
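/*
 Note on the enabled (#if 1) branch above: it uses the identity
     Ch(x,y,z) = ((y ^ z) & x) ^ z
 (each bit selects y when x=1 and z when x=0, i.e. (x&y) ^ (~x&z)),
 which needs only the single temporary t instead of two.
*/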
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj arg0, arg1, arg2
mov \arg1, t // y
mov \arg2, s // z
xor \arg2, t // y^z
and \arg1, s // y&z
and \arg0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on 4 quad-words in a ymm register
// use ymm6/ymm7 as intermediate registers
.macro Gamma0 arg0
vpsrlq $1, \arg0, %ymm6 // part of S64(1, x)
vpsllq $56, \arg0, %ymm7 // part of S64(8, x)
vpsrlq $7, \arg0, \arg0 // R(7, x)
vpxor %ymm6, \arg0, \arg0
vpsrlq $7, %ymm6, %ymm6 // part of S64(8, x)
vpxor %ymm7, \arg0, \arg0
vpsllq $7, %ymm7, %ymm7 // part of S64(1, x)
vpxor %ymm6, \arg0, \arg0
vpxor %ymm7, \arg0, \arg0
.endm
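/*
 AVX/AVX2 has no 64-bit vector rotate, so each S64(n, x) above is assembled from
 a shift pair: S64(1,x) = (x>>1) | (x<<63) and S64(8,x) = (x>>8) | (x<<56).
 The macro reuses the x>>1 and x<<56 intermediates (shifting each by a further
 7 bits) to obtain x>>8 and x<<63, saving two instructions; Gamma1 below applies
 the same trick to the 19/61/6 shift amounts.
*/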
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on 4 quad-words in a ymm register
// use ymm6/ymm7 as intermediate registers
.macro Gamma1 arg0
vpsrlq $19, \arg0, %ymm6 // part of S64(19, x)
vpsllq $3, \arg0, %ymm7 // part of S64(61, x)
vpsrlq $6, \arg0, \arg0 // R(6, x)
vpxor %ymm6, \arg0, \arg0
vpsrlq $42, %ymm6, %ymm6 // part of S64(61, x)
vpxor %ymm7, \arg0, \arg0
vpsllq $42, %ymm7, %ymm7 // part of S64(19, x)
vpxor %ymm6, \arg0, \arg0
vpxor %ymm7, \arg0, \arg0
.endm
.macro rightshift16 arg0, arg1
vpxor \arg1, \arg1, \arg1
vperm2f128 $33, \arg1, \arg0, \arg1
.endm
.macro leftshift16 arg0, arg1
vpxor \arg1, \arg1, \arg1
vperm2f128 $2, \arg1, \arg0, \arg1
.endm
.macro vpalignr8 arg0, arg1, arg2
vpblendd $3, \arg1, \arg0, \arg2
vpermq $57, \arg2, \arg2
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3
update 4 quad words in W0 += vext(W2,W3,#8) + Gamma0(vext(W0,W1, #8)) + Gamma1(W1<<16);
W0 += Gamma1(vext(W3,W0, #16)).
*/
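/*
 Illustrative note: because four schedule words W[r..r+3] are produced per
 invocation, the Gamma1(W[r-2]) term of the upper two lanes depends on W[r] and
 W[r+1], which are computed in the same invocation.  The macro therefore applies
 Gamma1 in two halves: first to [W[r-2] W[r-1] 0 0] (rightshift16 of \arg3), and
 once the low lanes are complete to [0 0 W[r] W[r+1]] (leftshift16 of \arg0).
 The scalar recurrence being vectorized is simply

    W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
    // Gamma0(x) = rotr64(x,1)  ^ rotr64(x,8)  ^ (x >> 7)
    // Gamma1(x) = rotr64(x,19) ^ rotr64(x,61) ^ (x >> 6)
    // with rotr64(x,n) = (x >> n) | (x << (64 - n))
*/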
.macro message_update4 arg0, arg1, arg2, arg3
vpblendd $3, \arg1, \arg0, %ymm5
vpxor %ymm4, %ymm4, %ymm4
vpermq $57, %ymm5, %ymm5 // ymm5 = W[r-15] = vpalignr8 \arg0, \arg1, %ymm5
vperm2f128 $33, %ymm4, \arg3, %ymm4 // ymm4 = [W[16] W[17] 0 0] half of W[r-2] = rightshift16 \arg3, %ymm4
Gamma0 %ymm5 // Gamma0(W[r-15])
Gamma1 %ymm4 // Gamma1(W[r-2]) half
vpaddq %ymm5, \arg0, \arg0 // W0 += Gamma0([r-15]);
vpblendd $3, \arg3, \arg2, %ymm5
vpaddq %ymm4, \arg0, \arg0 // W0 += Gamma1(W[r-2]) + Gamma0(vext(W0,W1, #8));
vpermq $57, %ymm5, %ymm5 // W[r-7] = vpalignr8 \arg2, \arg3, %ymm5 // W[r-7]
vpxor %ymm4, %ymm4, %ymm4
vpaddq %ymm5, \arg0, \arg0 // W0 += W[r-7]
vperm2f128 $2, %ymm4, \arg0, %ymm4 // leftshift16 \arg0, %ymm4 for W0<<16
Gamma1 %ymm4 // Gamma1(W0<<16)
vpaddq %ymm4, \arg0, \arg0 // W0 += Gamma1(W0<<16);
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0 arg0
rorx $28, \arg0, s // S(28, (x))
rorx $34, \arg0, t // S(34, (x))
rorx $11, s, u // S(39, (x))
xor s, t // S(28, (x)) ^ S(34, (x))
xor u, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm
// #define Sigma1(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x)))
.macro Sigma1 arg0
rorx $14, \arg0, s // S(14, (x))
rorx $18, \arg0, t // S(18, (x))
rorx $27, s, u // S(41, (x))
xor s, t // S(14, (x)) ^ S(18, (x))
xor u, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm
// per round digests update
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
Sigma1 \arg4 // t = T1
add t, \arg7 // use h to store h+Sigma1(e)
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
add \arg7, t // t = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), t // h = T1
add t, \arg3 // d += T1;
mov t, \arg7 // h = T1
Sigma0 \arg0 // t = Sigma0(a);
add t, \arg7 // h = T1 + Sigma0(a);
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
rorx $14, \arg4, s // S(14, (x))
mov \arg6, t // Ch(e,f,g) : 1
rorx $18, \arg4, u // S(18, (x))
xor \arg5, t // Ch(e,f,g) : 2
xor s, u // S(14, (x)) ^ S(18, (x))
and \arg4, t // Ch(e,f,g) : 3
rorx $27, s, s // S(41, (x))
xor \arg6, t // t = Ch(e,f,g);
xor s, u // u = Sigma1(e);
add t, \arg7 // h = h+Ch(e,f,g);
add u, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), \arg7 // h = T1
add \arg7, \arg3 // d += T1;
rorx $28, \arg0, s // S(28, (x))
rorx $34, \arg0, u // S(34, (x))
xor s, u // S(28, (x)) ^ S(34, (x))
rorx $11, s, s // S(39, (x))
xor s, u // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
add u, \arg7 // h = T1 + Sigma0(a);
mov \arg1, t // b
mov \arg2, s // c
xor \arg2, t // b^c
and \arg1, s // b&c
and \arg0, t // a&(b^c)
xor s, t // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
/*
16 rounds of hash update, update input schedule W (in vector register ymm0-ymm3) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
message_update4 W0, W1, W2, W3
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vpaddq 0*32(K), W0, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %ymm4, WK(0)
message_update4 W1, W2, W3, W0
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vpaddq 1*32(K), W1, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %ymm4, WK(4)
message_update4 W2, W3, W0, W1
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vpaddq 2*32(K), W2, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %ymm4, WK(8)
message_update4 W3, W0, W1, W2
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vpaddq 3*32(K), W3, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %ymm4, WK(12)
addq $128, K
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vmovdqu 0*32(data), W0
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
vpshufb L_aligned_bswap, W0, W0
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
vpaddq 0*32(K), W0, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
vmovdqa %ymm4, WK(0)
vmovdqu 1*32(data), W1
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
vpshufb L_aligned_bswap, W1, W1
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
vpaddq 1*32(K), W1, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
vmovdqa %ymm4, WK(4)
vmovdqu 2*32(data), W2
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
vpshufb L_aligned_bswap, W2, W2
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
vpaddq 2*32(K), W2, %ymm4
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
vmovdqa %ymm4, WK(8)
vmovdqu 3*32(data), W3
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
vpshufb L_aligned_bswap, W3, W3
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
vpaddq 3*32(K), W3, %ymm4
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
vmovdqa %ymm4, WK(12)
addq $128, K
addq $128, data
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
.text
.globl _AccelerateCrypto_SHA512_compress_AVX2
_AccelerateCrypto_SHA512_compress_AVX2:
// push callee-saved registers
push %rbp
movq %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack space
sub $stack_size, sp
andq $-32, sp // aligned sp to 32-bytes
// if kernel code, save used vector registers (ymm0-ymm7)
#if BUILDKERNEL
vmovdqa %ymm0, 0*32+ymm_save
vmovdqa %ymm1, 1*32+ymm_save
vmovdqa %ymm2, 2*32+ymm_save
vmovdqa %ymm3, 3*32+ymm_save
vmovdqa %ymm4, 4*32+ymm_save
vmovdqa %ymm5, 5*32+ymm_save
vmovdqa %ymm6, 6*32+ymm_save
vmovdqa %ymm7, 7*32+ymm_save
#endif
movq num_blocks, _num_blocks
// set up bswap parameters in the aligned stack space and pointer to table K512[]
lea CC_C_LABEL(sha512_K)(%rip), K
// load W[0:15] into ymm0-ymm3
vmovdqu 0*32(data), W0
vmovdqu 1*32(data), W1
vmovdqu 2*32(data), W2
vmovdqu 3*32(data), W3
addq $128, data
vmovdqa L_aligned_bswap, %ymm4
vpshufb %ymm4, W0, W0
vpshufb %ymm4, W1, W1
vpshufb %ymm4, W2, W2
vpshufb %ymm4, W3, W3
// compute WK[0:15] and save in stack
vpaddq 0*32(K), W0, %ymm4
vpaddq 1*32(K), W1, %ymm5
vpaddq 2*32(K), W2, %ymm6
vpaddq 3*32(K), W3, %ymm7
addq $128, K
vmovdqa %ymm4, WK(0)
vmovdqa %ymm5, WK(4)
vmovdqa %ymm6, WK(8)
vmovdqa %ymm7, WK(12)
L_loop:
// digests a-h = ctx->states;
mov 0*8(ctx), a
mov 1*8(ctx), b
mov 2*8(ctx), c
mov 3*8(ctx), d
mov 4*8(ctx), e
mov 5*8(ctx), f
mov 6*8(ctx), g
mov 7*8(ctx), h
// rounds 0:63 interleaved with W/WK update for rounds 16:79
rounds_schedule a, b, c, d, e, f, g, h, 16
rounds_schedule a, b, c, d, e, f, g, h, 32
rounds_schedule a, b, c, d, e, f, g, h, 48
rounds_schedule a, b, c, d, e, f, g, h, 64
// revert K to the beginning of K512[]
subq $640, K
subq $1, _num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
jmp L_loop // branch for next block
// wrap up digest update rounds 64:79 for the final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
// if kernel, restore ymm0-ymm7
#if BUILDKERNEL
vmovdqa 0*32+ymm_save, %ymm0
vmovdqa 1*32+ymm_save, %ymm1
vmovdqa 2*32+ymm_save, %ymm2
vmovdqa 3*32+ymm_save, %ymm3
vmovdqa 4*32+ymm_save, %ymm4
vmovdqa 5*32+ymm_save, %ymm5
vmovdqa 6*32+ymm_save, %ymm6
vmovdqa 7*32+ymm_save, %ymm7
#endif
// free allocated stack memory
leaq -40(%rbp), sp
// restore callee-saved registers
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 5
L_bswap:
.quad 0x0001020304050607
.quad 0x08090a0b0c0d0e0f
.quad 0x1011121314151617
.quad 0x18191a1b1c1d1e1f
#endif // x86_64

View File

@ -0,0 +1,619 @@
# Copyright (c) (2016,2018,2019) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
#include <corecrypto/cc_config.h>
/*
This file provides x86_64 hand implementation of the following function
void sha512_compress(uint64_t *state, size_t nblocks, const void *in);
sha512 algorithm per block description:
1. W(0:15) = big-endian (per 8 bytes) loading of input data (128 bytes)
2. load 8 digests (each 64bit) a-h from state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:79
W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
d += T1;
h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm7 (or ymm0-ymm3/zmm0-zmm1 for avx2/avx512)
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR (%r8-%r15)
----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state;
for (r=0;r<64;r+=2) {
digests a-h update and permute round r:r+1
update W([r:r+1]%16) and WK([r:r+1]%16) for the next 8th iteration
}
num_block--;
if (num_block==0) jmp L_last_block;
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+1
load W([r:r+1]%16) (big-endian per 8 bytes) into xmm0:xmm7
pre_calculate and store W+K([r:r+1]%16) in stack
}
ctx->states += digests a-h;
jmp L_loop;
L_last_block:
for (r=64;r<80;r+=2) {
digests a-h update and permute round r:r+2
}
ctx->states += digests a-h;
------------------------------------------------------------------------
Apple CoreOS vector & numerics
*/
#if defined __x86_64__
// associate variables with registers or memory
#define sp %rsp
#define ctx %rdi
#define num_blocks %rsi // later move this to stack, use %rsi for temp variable u
#define data %rdx
#define a %r8
#define b %r9
#define c %r10
#define d %r11
#define e %r12
#define f %r13
#define g %r14
#define h %r15
#define K %rbx
#define _num_blocks (-48)(%rbp) // rbx/r12-r15
#define stack_size (8+16*12+128) // 8 (_num_blocks) + xmm0:xmm11 + WK(0:15)
#define L_aligned_bswap L_bswap(%rip) // bswap : big-endian loading of 8-byte words
#define xmm_save 128(sp) // starting address for xmm save/restore
// 3 local variables
#define s %rax
#define t %rcx
#define u %rsi
// a window (16 quad-words) of the message schedule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
#define W4 %xmm4
#define W5 %xmm5
#define W6 %xmm6
#define W7 %xmm7
// circular buffer for WK[(r:r+15)%16]
#define WK(x) ((x)&15)*8(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch arg0, arg1, arg2
#if 1
mov \arg2, t
xor \arg1, t
and \arg0, t
xor \arg2, t
#else
mov \arg0, t // x
mov \arg0, s // x
not t // ~x
and \arg1, s // x & y
and \arg2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
#endif
.endm
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj arg0, arg1, arg2
mov \arg1, t // y
mov \arg2, s // z
xor \arg2, t // y^z
and \arg1, s // y&z
and \arg0, t // x&(y^z)
xor s, t // Maj(x,y,z)
.endm
// #define Gamma0(x) (S64(1, (x)) ^ S64(8, (x)) ^ R(7 , (x)))
// performs Gamma0_512 on 2 quad-words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma0 arg0
movdqa \arg0, %xmm8
movdqa \arg0, %xmm9
psrlq $7, \arg0 // R(7, x)
psrlq $1, %xmm8 // part of S64(1, x)
psllq $56, %xmm9 // part of S64(8, x)
pxor %xmm8, \arg0
psrlq $7, %xmm8 // part of S64(8, x)
pxor %xmm9, \arg0
psllq $7, %xmm9 // part of S64(1, x)
pxor %xmm8, \arg0
pxor %xmm9, \arg0
.endm
// #define Gamma1(x) (S64(19, (x)) ^ S64(61, (x)) ^ R(6, (x)))
// performs Gamma1_512 on 2 quad-words in an xmm register
// use xmm8/xmm9 as intermediate registers
.macro Gamma1 arg0
movdqa \arg0, %xmm8
movdqa \arg0, %xmm9
psrlq $6, \arg0 // R(6, x)
psrlq $19, %xmm8 // part of S64(19, x)
psllq $3, %xmm9 // part of S64(61, x)
pxor %xmm8, \arg0
psrlq $42, %xmm8 // part of S64(61, x)
pxor %xmm9, \arg0
psllq $42, %xmm9 // part of S64(19, x)
pxor %xmm8, \arg0
pxor %xmm9, \arg0
.endm
// W[r] = W[r-16] + Gamma1(W[r-2]) + W[r-7] + Gamma0(W[r-15]);
/*
W0 W1 W2 W3 W4 W5 W6 W7
update 2 quad words in W0 = W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1)).
use %xmm10, %xmm11 for temp
*/
.macro message_update2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
movdqa \arg5, %xmm10
movdqa \arg1, %xmm11
palignr $8, \arg4, %xmm10 // vext(W4,W5)
palignr $8, \arg0, %xmm11 // vext(W0,W1)
paddq %xmm10, \arg0 // W0 + vext(W4,W5)
movdqa \arg7, %xmm10
Gamma1 %xmm10 // Gamma1(W7)
Gamma0 %xmm11 // Gamma0(vext(W0,W1))
paddq %xmm10, \arg0 // W0 + Gamma1(W7) + vext(W4,W5)
paddq %xmm11, \arg0 // W0 + Gamma1(W7) + vext(W4,W5) + Gamma0(vext(W0,W1))
.endm
// #define Sigma0(x) (S64(28, (x)) ^ S64(34, (x)) ^ S64(39, (x)))
.macro Sigma0 arg0
mov \arg0, t // x
mov \arg0, s // x
ror $28, t // S(28, (x))
ror $34, s // S(34, (x))
xor s, t // S(28, (x)) ^ S(34, (x))
ror $5, s // S(39, (x))
xor s, t // t = (S(28, (x)) ^ S(34, (x)) ^ S(39, (x)))
.endm
// #define Sigma1(x) (S64(14, (x)) ^ S64(18, (x)) ^ S64(41, (x)))
.macro Sigma1 arg0
mov \arg0, s // x
ror $14, s // S(14, (x))
mov s, t // S(14, (x))
ror $4, s // S(18, (x))
xor s, t // S(14, (x)) ^ S(18, (x))
ror $23, s // S(41, (x))
xor s, t // t = (S(14, (x)) ^ S(18, (x)) ^ S(41, (x)))
.endm
// per round digests update
.macro round_ref arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
Sigma1 \arg4 // t = Sigma1(e);
add t, \arg7 // h = h+Sigma1(e)
Ch \arg4, \arg5, \arg6 // t = Ch (e, f, g);
add t, \arg7 // h = h+Sigma1(e)+Ch(e,f,g);
add WK(\arg8), \arg7 // h = h+Sigma1(e)+Ch(e,f,g)+WK
add \arg7, \arg3 // d += h;
Sigma0 \arg0 // t = Sigma0(a);
add t, \arg7 // h += Sigma0(a);
Maj \arg0, \arg1, \arg2 // t = Maj(a,b,c)
add t, \arg7 // h = T1 + Sigma0(a) + Maj(a,b,c);
.endm
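/*
 The round macro below is the instruction-scheduled version of round_ref.  It
 builds the three-rotation sums with the identity ror(x ^ y, n) = ror(x, n) ^ ror(y, n):
     Sigma1(e) = ror(ror(ror(e, 41-18) ^ e, 18-14) ^ e, 14) = ror(e,41) ^ ror(e,18) ^ ror(e,14)
     Sigma0(a) = ror(ror(ror(a, 39-34) ^ a, 34-28) ^ a, 28) = ror(a,39) ^ ror(a,34) ^ ror(a,28)
 which is why the rotate counts appear as differences of the SHA-512 constants.
*/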
.macro round arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
mov \arg4, s
mov \arg0, t
ror $(41-18), s
ror $(39-34), t
xor \arg4, s
mov \arg5, u
xor \arg0, t
ror $(18-14), s
xor \arg6, u
xor \arg4, s
ror $(34-28), t
and \arg4, u
xor \arg0, t
xor \arg6, u
ror $14, s
ror $28, t
add s, u
mov \arg0, s
add WK(\arg8), u
or \arg2, s
add u, \arg7
mov \arg0, u
add \arg7, \arg3
and \arg1, s
and \arg2, u
or u, s
add t, \arg7
add s, \arg7
.endm
/*
16 rounds of hash update, update input schedule W (in vector register xmm0-xmm7) and WK = W + K (in stack)
*/
.macro rounds_schedule arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
message_update2 W0, W1, W2, W3, W4, W5, W6, W7
movdqa 0*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
paddq W0, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
movdqa %xmm8, WK(0)
message_update2 W1, W2, W3, W4, W5, W6, W7, W0
movdqa 1*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
paddq W1, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
movdqa %xmm8, WK(2)
message_update2 W2, W3, W4, W5, W6, W7, W0, W1
movdqa 2*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
paddq W2, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
movdqa %xmm8, WK(4)
message_update2 W3, W4, W5, W6, W7, W0, W1, W2
movdqa 3*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
paddq W3, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
movdqa %xmm8, WK(6)
message_update2 W4, W5, W6, W7, W0, W1, W2, W3
movdqa 4*16(K), %xmm8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
paddq W4, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
movdqa %xmm8, WK(8)
message_update2 W5, W6, W7, W0, W1, W2, W3, W4
movdqa 5*16(K), %xmm8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
paddq W5, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
movdqa %xmm8, WK(10)
message_update2 W6, W7, W0, W1, W2, W3, W4, W5
movdqa 6*16(K), %xmm8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
paddq W6, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
movdqa %xmm8, WK(12)
message_update2 W7, W0, W1, W2, W3, W4, W5, W6
movdqa 7*16(K), %xmm8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
paddq W7, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
movdqa %xmm8, WK(14)
addq $128, K
.endm
/*
16 rounds of hash update, load new input schedule W (in vector register xmm0-xmm7) and update WK = W + K (in stack)
*/
.macro rounds_schedule_initial arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
movdqu 0*16(data), W0
movdqa 0*16(K), %xmm8
pshufb L_aligned_bswap, W0
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
paddq W0, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
movdqa %xmm8, WK(0)
movdqu 1*16(data), W1
movdqa 1*16(K), %xmm8
pshufb L_aligned_bswap, W1
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
paddq W1, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
movdqa %xmm8, WK(2)
movdqu 2*16(data), W2
movdqa 2*16(K), %xmm8
pshufb L_aligned_bswap, W2
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
paddq W2, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
movdqa %xmm8, WK(4)
movdqu 3*16(data), W3
movdqa 3*16(K), %xmm8
pshufb L_aligned_bswap, W3
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
paddq W3, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
movdqa %xmm8, WK(6)
movdqu 4*16(data), W4
movdqa 4*16(K), %xmm8
pshufb L_aligned_bswap, W4
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
paddq W4, %xmm8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
movdqa %xmm8, WK(8)
movdqu 5*16(data), W5
movdqa 5*16(K), %xmm8
pshufb L_aligned_bswap, W5
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
paddq W5, %xmm8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
movdqa %xmm8, WK(10)
movdqu 6*16(data), W6
movdqa 6*16(K), %xmm8
pshufb L_aligned_bswap, W6
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
paddq W6, %xmm8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
movdqa %xmm8, WK(12)
movdqu 7*16(data), W7
movdqa 7*16(K), %xmm8
pshufb L_aligned_bswap, W7
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
paddq W7, %xmm8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
movdqa %xmm8, WK(14)
addq $128, K
addq $128, data
.endm
/*
16 rounds of hash update
*/
.macro rounds_schedule_final arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 0+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 1+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 2+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 3+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 4+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 5+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 6+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 7+\arg8
round \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, 8+\arg8
round \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, 9+\arg8
round \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, 10+\arg8
round \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, \arg4, 11+\arg8
round \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, \arg3, 12+\arg8
round \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, \arg2, 13+\arg8
round \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, \arg1, 14+\arg8
round \arg1, \arg2, \arg3, \arg4, \arg5, \arg6, \arg7, \arg0, 15+\arg8
.endm
.text
.globl _AccelerateCrypto_SHA512_compress_ssse3
_AccelerateCrypto_SHA512_compress_ssse3:
// push callee-saved registers
push %rbp
movq %rsp, %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// allocate stack space
sub $stack_size, sp
// if kernel code, save used xmm registers
#if BUILDKERNEL
movdqa %xmm0, 0*16+xmm_save
movdqa %xmm1, 1*16+xmm_save
movdqa %xmm2, 2*16+xmm_save
movdqa %xmm3, 3*16+xmm_save
movdqa %xmm4, 4*16+xmm_save
movdqa %xmm5, 5*16+xmm_save
movdqa %xmm6, 6*16+xmm_save
movdqa %xmm7, 7*16+xmm_save
movdqa %xmm8, 8*16+xmm_save
movdqa %xmm9, 9*16+xmm_save
movdqa %xmm10, 10*16+xmm_save
movdqa %xmm11, 11*16+xmm_save
#endif
movq num_blocks, _num_blocks
// set up bswap parameters in the aligned stack space and pointer to table K512[]
lea CC_C_LABEL(sha512_K)(%rip), K
// load W[0:15] into xmm0-xmm7
movdqu 0*16(data), W0
movdqu 1*16(data), W1
movdqu 2*16(data), W2
movdqu 3*16(data), W3
movdqu 4*16(data), W4
movdqu 5*16(data), W5
movdqu 6*16(data), W6
movdqu 7*16(data), W7
addq $128, data
movdqa L_aligned_bswap, %xmm8
pshufb %xmm8, W0
pshufb %xmm8, W1
pshufb %xmm8, W2
pshufb %xmm8, W3
pshufb %xmm8, W4
pshufb %xmm8, W5
pshufb %xmm8, W6
pshufb %xmm8, W7
// compute WK[0:15] and save in stack
movdqa 0*16(K), %xmm8
movdqa 1*16(K), %xmm9
movdqa 2*16(K), %xmm10
movdqa 3*16(K), %xmm11
paddq %xmm0, %xmm8
paddq %xmm1, %xmm9
paddq %xmm2, %xmm10
paddq %xmm3, %xmm11
movdqa %xmm8, WK(0)
movdqa %xmm9, WK(2)
movdqa %xmm10, WK(4)
movdqa %xmm11, WK(6)
movdqa 4*16(K), %xmm8
movdqa 5*16(K), %xmm9
movdqa 6*16(K), %xmm10
movdqa 7*16(K), %xmm11
paddq %xmm4, %xmm8
paddq %xmm5, %xmm9
paddq %xmm6, %xmm10
paddq %xmm7, %xmm11
movdqa %xmm8, WK(8)
movdqa %xmm9, WK(10)
movdqa %xmm10, WK(12)
movdqa %xmm11, WK(14)
addq $128, K
L_loop:
// digests a-h = ctx->states;
mov 0*8(ctx), a
mov 1*8(ctx), b
mov 2*8(ctx), c
mov 3*8(ctx), d
mov 4*8(ctx), e
mov 5*8(ctx), f
mov 6*8(ctx), g
mov 7*8(ctx), h
// rounds 0:63 interleaved with W/WK update for rounds 16:79
rounds_schedule a, b, c, d, e, f, g, h, 16
rounds_schedule a, b, c, d, e, f, g, h, 32
rounds_schedule a, b, c, d, e, f, g, h, 48
rounds_schedule a, b, c, d, e, f, g, h, 64
// revert K to the beginning of K512[]
subq $640, K
subq $1, _num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
rounds_schedule_initial a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
jmp L_loop // branch for next block
// wrap up digest update rounds 64:79 for the final block
L_final_block:
rounds_schedule_final a, b, c, d, e, f, g, h, 0
// ctx->states += digests a-h
add a, 0*8(ctx)
add b, 1*8(ctx)
add c, 2*8(ctx)
add d, 3*8(ctx)
add e, 4*8(ctx)
add f, 5*8(ctx)
add g, 6*8(ctx)
add h, 7*8(ctx)
// if kernel, restore xmm0-xmm11
#if BUILDKERNEL
movdqa 0*16+xmm_save, %xmm0
movdqa 1*16+xmm_save, %xmm1
movdqa 2*16+xmm_save, %xmm2
movdqa 3*16+xmm_save, %xmm3
movdqa 4*16+xmm_save, %xmm4
movdqa 5*16+xmm_save, %xmm5
movdqa 6*16+xmm_save, %xmm6
movdqa 7*16+xmm_save, %xmm7
movdqa 8*16+xmm_save, %xmm8
movdqa 9*16+xmm_save, %xmm9
movdqa 10*16+xmm_save, %xmm10
movdqa 11*16+xmm_save, %xmm11
#endif
// free allocated stack memory
add $stack_size, sp
// restore callee-saved registers
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
// return
ret
// data for using ssse3 pshufb instruction (big-endian loading of data)
CC_ASM_SECTION_CONST
.p2align 4
L_bswap:
.quad 0x0001020304050607
.quad 0x08090a0b0c0d0e0f
#endif // x86_64

View File

@ -0,0 +1,58 @@
/* Copyright (c) (2016,2018-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stdint.h>
#include <corecrypto/cc_config.h>
/* the K array */
const uint64_t sha512_K[80] CC_ALIGNED(16) = {
0x428a2f98d728ae22, 0x7137449123ef65cd,
0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
0x3956c25bf348b538, 0x59f111f1b605d019,
0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
0xd807aa98a3030242, 0x12835b0145706fbe,
0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
0x9bdc06a725c71235, 0xc19bf174cf692694,
0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
0x983e5152ee66dfab, 0xa831c66d2db43210,
0xb00327c898fb213f, 0xbf597fc7beef0ee4,
0xc6e00bf33da88fc2, 0xd5a79147930aa725,
0x06ca6351e003826f, 0x142929670a0e6e70,
0x27b70a8546d22ffc, 0x2e1b21385c26c926,
0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
0x650a73548baf63de, 0x766a0abb3c77b2a8,
0x81c2c92e47edaee6, 0x92722c851482353b,
0xa2bfe8a14cf10364, 0xa81a664bbc423001,
0xc24b8b70d0f89791, 0xc76c51a30654be30,
0xd192e819d6ef5218, 0xd69906245565a910,
0xf40e35855771202a, 0x106aa07032bbd1b8,
0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
0x748f82ee5defb2fc, 0x78a5636f43172f60,
0x84c87814a1f0ab72, 0x8cc702081a6439ec,
0x90befffa23631e28, 0xa4506cebde82bde9,
0xbef9a3f7b2c67915, 0xc67178f2e372532b,
0xca273eceea26619c, 0xd186b8c721c0c207,
0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
0x06f067aa72176fba, 0x0a637dc5a2c898a6,
0x113f9804bef90dae, 0x1b710b35131c471b,
0x28db77f523047d84, 0x32caab7b40c72493,
0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
0x5fcb6fab3ad6faec, 0x6c44198c4a475817
};

182
cc/corecrypto/cc.h Normal file
View File

@ -0,0 +1,182 @@
/* Copyright (c) (2010-2012,2014-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_H_
#define _CORECRYPTO_CC_H_
#include <corecrypto/cc_config.h>
#include <corecrypto/cc_error.h>
#include <string.h>
#include <stdint.h>
#if __has_feature(attribute_availability_with_replacement)
#if __has_feature(attribute_availability_bridgeos)
#ifndef __CC_BRIDGE_OS_DEPRECATED
#define __CC_BRIDGEOS_DEPRECATED(_dep, _msg) __attribute__((availability(bridgeos,deprecated=_dep, replacement=_msg)))
#endif
#endif
#ifndef __CC_BRIDGEOS_DEPRECATED
#define __CC_BRIDGEOS_DEPRECATED(_dep, _msg)
#endif
#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version) \
__attribute__((availability(macos,deprecated=macos_version, replacement=replacement_message)))\
__attribute__((availability(ios,deprecated=ios_version, replacement=replacement_message)))\
__attribute__((availability(watchos,deprecated=watchos_version, replacement=replacement_message)))\
__attribute__((availability(tvos,deprecated=tvos_version, replacement=replacement_message)))\
__CC_BRIDGEOS_DEPRECATED(bridgeos_version, replacement_message)
#else /* !__has_feature(attribute_availability_with_replacement) */
#define cc_deprecate_with_replacement(replacement_message, ios_version, macos_version, tvos_version, watchos_version, bridgeos_version)
#endif /* __has_feature(attribute_availability_with_replacement) */
/* Provide a general purpose macro concat method. */
#define cc_concat_(a, b) a##b
#define cc_concat(a, b) cc_concat_(a, b)
#if defined(_MSC_VER)
#define __asm__(x)
#endif
/* Manage asserts here because a few functions in header public files do use asserts */
#if CORECRYPTO_DEBUG
#define cc_assert(x) assert(x)
#else
#define cc_assert(x)
#endif
#if CC_KERNEL
#include <kern/assert.h>
#elif CC_USE_S3
#define assert(args) // No assert in S3
#else
#include <assert.h>
#endif
/* Provide a static assert that can be used to create compile-type failures. */
#define cc_static_assert(e,m) \
enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) }
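/* Usage sketch (illustrative, not part of the original header):
       cc_static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes");
   When the condition is false the enum initializer divides by zero, which is
   rejected at compile time; the message argument only serves as documentation. */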
/* Declare a struct element with a guaranteed alignment of _alignment_.
The resulting struct can be used to create arrays that are aligned by
a certain amount. */
#define cc_aligned_struct(_alignment_) \
typedef struct { \
uint8_t b[_alignment_]; \
} CC_ALIGNED(_alignment_)
#if defined(__BIGGEST_ALIGNMENT__)
#define CC_MAX_ALIGNMENT ((size_t)__BIGGEST_ALIGNMENT__)
#else
#define CC_MAX_ALIGNMENT ((size_t)16)
#endif
/* pads a given size to be a multiple of the biggest alignment for any type */
#define cc_pad_align(_size_) ((_size_ + CC_MAX_ALIGNMENT - 1) & (~(CC_MAX_ALIGNMENT - 1)))
/* number of array elements used in a cc_ctx_decl */
#define cc_ctx_n(_type_, _size_) ((_size_ + sizeof(_type_) - 1) / sizeof(_type_))
/* sizeof of a context declared with cc_ctx_decl */
#define cc_ctx_sizeof(_type_, _size_) sizeof(_type_[cc_ctx_n(_type_, _size_)])
/*
1. _alloca cannot be removed because this header file is compiled with both MSVC++ and with clang.
2. The _MSC_VER version of cc_ctx_decl() is not compatible with the way the *_decl macros are used in CommonCrypto, AppleKeyStore and SecurityFrameworks. To observe the incompatibilities and errors, use the definition below. Corecrypto itself accepts both definitions:
#define cc_ctx_decl(_type_, _size_, _name_) _type_ _name_ ## _array[cc_ctx_n(_type_, (_size_))]; _type_ *_name_ = _name_ ## _array
3. Never use the sizeof() operator on variables declared with cc_ctx_decl(), because it is not compatible with the _MSC_VER version of cc_ctx_decl().
*/
#if defined(_MSC_VER)
#include <malloc.h>
#define cc_ctx_decl(_type_, _size_, _name_) _type_ * _name_ = (_type_ *) _alloca(sizeof(_type_) * cc_ctx_n(_type_, _size_) )
#define cc_ctx_decl_field(_type_, _size_, _name_) _type_ _name_ [cc_ctx_n(_type_, _size_)]
#else
#define cc_ctx_decl(_type_, _size_, _name_) \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wvla\"") \
_type_ _name_ [cc_ctx_n(_type_, _size_)] \
_Pragma("GCC diagnostic pop")
#define cc_ctx_decl_field cc_ctx_decl
#endif
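/* Usage sketch (illustrative): to reserve _size_ = 100 bytes of uint64_t-sized storage,
       cc_ctx_decl(uint64_t, 100, ctx);
   declares an array of cc_ctx_n(uint64_t, 100) = (100 + 8 - 1) / 8 = 13 elements
   (104 bytes).  Per note 3 above, query the size with cc_ctx_sizeof() or the
   original _size_ rather than sizeof(ctx), so the code also works with the
   _MSC_VER definition. */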
/*!
@brief cc_clear(len, dst) zeroizes array dst and it will not be optimized out.
@discussion It is used to clear sensitive data, particularly when the data is defined on the stack
@param len number of bytes to be cleared in dst
@param dst the array to be cleared
*/
CC_NONNULL((2))
void cc_clear(size_t len, void *dst);
// cc_zero is deprecated, please use cc_clear instead.
cc_deprecate_with_replacement("cc_clear", 13.0, 10.15, 13.0, 6.0, 4.0)
CC_NONNULL_ALL CC_INLINE
void cc_zero(size_t len, void *dst)
{
cc_clear(len, dst);
}
#define cc_copy(_size_, _dst_, _src_) memcpy(_dst_, _src_, _size_)
CC_INLINE CC_NONNULL((2, 3, 4))
void cc_xor(size_t size, void *r, const void *s, const void *t) {
uint8_t *_r=(uint8_t *)r;
const uint8_t *_s=(const uint8_t *)s;
const uint8_t *_t=(const uint8_t *)t;
while (size--) {
_r[size] = _s[size] ^ _t[size];
}
}
/*!
@brief cc_cmp_safe(num, ptr1, ptr2) compares two arrays ptr1 and ptr2 of num bytes.
@discussion The execution time is independent of the contents of the data and therefore leaks nothing about them. However, the execution time does depend on num.
@param num number of bytes in each array
@param ptr1 input array
@param ptr2 input array
@return returns 0 if the num bytes starting at ptr1 are identical to the num bytes starting at ptr2 and 1 if they are different or if num is 0 (empty arrays).
*/
CC_NONNULL((2, 3))
int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2);
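// Usage sketch (illustrative; tag and expected are hypothetical 16-byte buffers):
//     if (cc_cmp_safe(16, tag, expected) != 0) {
//         // reject: buffers differ (or num was 0)
//     }
// Unlike memcmp(), the execution time does not reveal the position of the first mismatch.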
/* Exchange S and T of any type. NOTE: Both S and T are evaluated
multiple times and MUST NOT be expressions. */
#define CC_SWAP(S,T) do { \
volatile __typeof__(S) _cc_swap_tmp = S; S = T; T = _cc_swap_tmp; \
_cc_swap_tmp = 0;\
} while(0)
/* Return the maximum value between S and T. */
#define CC_MAX(S, T) ({__typeof__(S) _cc_max_s = S; __typeof__(T) _cc_max_t = T; _cc_max_s > _cc_max_t ? _cc_max_s : _cc_max_t;})
/* Clone of CC_MAX() that evaluates S and T multiple times to allow nesting. */
#define CC_MAX_EVAL(S, T) ((S) > (T) ? (S) : (T))
/* Return the minimum value between S and T. */
#define CC_MIN(S, T) ({__typeof__(S) _cc_min_s = S; __typeof__(T) _cc_min_t = T; _cc_min_s <= _cc_min_t ? _cc_min_s : _cc_min_t;})
/*
When building with "-nostdinc" (i.e. iboot), ptrauth.h is in a non-standard location.
This requires a new flag to be used when building iboot: -ibuiltininc which is not
yet available.
*/
#if __has_feature(ptrauth_calls) && (CC_KERNEL || CC_USE_L4 || CC_USE_SEPROM)
#include <ptrauth.h>
#define CC_SPTR(_sn_, _n_) \
__ptrauth(ptrauth_key_process_independent_code, 1, ptrauth_string_discriminator("cc_" #_sn_ #_n_)) _n_
#else
#define CC_SPTR(_sn_, _n_) _n_
#endif
#endif /* _CORECRYPTO_CC_H_ */

View File

@ -0,0 +1,83 @@
/* Copyright (c) (2016-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef cc_absolute_time_h
#define cc_absolute_time_h
#include <corecrypto/cc_config.h>
#include <stdint.h>
// For more info on mach_absolute_time() precision:
// https://developer.apple.com/library/mac/qa/qa1398/_index.html
#if CC_USE_L4
#include <ert/time.h>
#define cc_absolute_time() ert_time_now()
// L4 doesn't use a scaling factor
#define cc_absolute_time_sf() (1.0 / 1000000000.0)
#elif CC_KERNEL
#include <mach/mach_time.h>
#include <kern/clock.h>
#define cc_absolute_time() (mach_absolute_time())
// Scale factor to convert absolute time to seconds
#define cc_absolute_time_sf() ({ \
struct mach_timebase_info info; \
clock_timebase_info(&info); \
((double)info.numer) / (1000000000.0 * info.denom); \
})
#elif CC_DARWIN
#include <mach/mach_time.h>
#define cc_absolute_time() (mach_absolute_time())
// Scale factor to convert absolute time to seconds
#define cc_absolute_time_sf() ({ \
struct mach_timebase_info info; \
mach_timebase_info(&info); \
((double)info.numer) / (1000000000.0 * info.denom); \
})
#elif defined(_WIN32)
#include <windows.h>
CC_INLINE uint64_t cc_absolute_time(void) {
LARGE_INTEGER time;
QueryPerformanceCounter(&time); //resolution < 1us
return (uint64_t)time.QuadPart;
}
CC_INLINE double cc_absolute_time_sf(){
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq); //performance counter freq in Hz
return (double)1 / freq.QuadPart;
}
#elif CC_LINUX
#if CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#include <mach/mach_time.h>
#define cc_absolute_time() (mach_absolute_time()) // To test compilation on mac
#else
// The following is specific to non x86 (arm/mips/etc...) architectures on Linux.
#warning cc_absolute_time() has not been tested
#include <time.h>
#define NSEC_PER_USEC 1000ull
CC_INLINE uint64_t cc_absolute_time() {
struct timespec tm;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tm);
return tm.tv_sec * 1000000000ull + tm.tv_nsec;
}
#endif // CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#define cc_absolute_time_sf() (1.0 / 1000000000.0)
#else
#warning Target OS is not defined. There should be a definition for cc_absolute_time() for the target OS/platform.
#endif
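// Usage sketch (illustrative): measuring elapsed time in seconds with whichever
// primitives the platform branch above selected:
//     uint64_t t0 = cc_absolute_time();
//     /* ... work to be timed ... */
//     uint64_t t1 = cc_absolute_time();
//     double seconds = (double)(t1 - t0) * cc_absolute_time_sf();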
#endif /* cc_absolute_time_h */

600
cc/corecrypto/cc_config.h Normal file
View File

@ -0,0 +1,600 @@
/* Copyright (c) (2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_CONFIG_H_
#define _CORECRYPTO_CC_CONFIG_H_
/* A word about configuration macros:
Conditional configuration macros specific to corecrypto should be named CORECRYPTO_xxx
or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an
#ifndef #error construct at the end of this file to make sure it's always defined.
They should always be tested using the #if directive, never the #ifdef directive.
No other conditional macros shall ever be used (except in this file)
Configuration Macros that are defined outside of corecrypto (eg: KERNEL, DEBUG, ...)
shall only be used in this file to define CCxxx macros.
External macros should be assumed to be either undefined, defined with no value,
or defined as true or false. We shall strive to build with -Wundef whenever possible,
so the following construct should be used to test external macros in this file:
#if defined(DEBUG) && (DEBUG)
#define CORECRYPTO_DEBUG 1
#else
#define CORECRYPTO_DEBUG 0
#endif
It is acceptable to define a conditional CC_xxxx macro in an implementation file,
to be used only in this file.
The current code is not guaranteed to follow those rules, but should be fixed to.
Corecrypto requires GNU and C99 compatibility.
Typically enabled by passing --gnu --c99 to the compiler (eg. armcc)
*/
//Do not set this macro to 1, unless you are developing/testing for Linux under macOS
#define CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT 0
//Do not set these macros to 1, unless you are developing/testing for Windows under macOS
#define CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT 0
#define CORECRYPTO_HACK_FOR_WINDOWS_DEVELOPMENT 0
#if (defined(DEBUG) && (DEBUG)) || defined(_DEBUG) //MSVC defines _DEBUG
/* CC_DEBUG is already used in CommonCrypto */
#define CORECRYPTO_DEBUG 1
#else
#define CORECRYPTO_DEBUG 0
#endif
// This macro can be used to enable prints when a condition in the macro "cc_require"
// is false. This is especially useful to confirm that negative testing fails
// at the intended location
#define CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS 0
#if defined(KERNEL) && (KERNEL)
#define CC_KERNEL 1 // KEXT, XNU repo or kernel components such as AppleKeyStore
#else
#define CC_KERNEL 0
#endif
#if defined(__linux__) || CORECRYPTO_SIMULATE_POSIX_ENVIRONMENT
#define CC_LINUX 1
#else
#define CC_LINUX 0
#endif
#if defined(__ANDROID__) && (__ANDROID__)
#define CC_ANDROID 1
#else
#define CC_ANDROID 0
#endif
#if defined(USE_L4) && (USE_L4)
#define CC_USE_L4 1
#else
#define CC_USE_L4 0
#endif
#if defined(RTKIT) && (RTKIT)
#define CC_RTKIT 1
#else
#define CC_RTKIT 0
#endif
#if defined(RTKITROM) && (RTKITROM)
#define CC_RTKITROM 1
#else
#define CC_RTKITROM 0
#endif
#if defined(USE_SEPROM) && (USE_SEPROM)
#define CC_USE_SEPROM 1
#else
#define CC_USE_SEPROM 0
#endif
#if defined(USE_S3) && (USE_S3)
#define CC_USE_S3 1
#else
#define CC_USE_S3 0
#endif
#if (defined(ICE_FEATURES_ENABLED)) || (defined(MAVERICK) && (MAVERICK))
#define CC_BASEBAND 1
#else
#define CC_BASEBAND 0
#endif
#if defined(EFI) && (EFI)
#define CC_EFI 1
#else
#define CC_EFI 0
#endif
#if defined(IBOOT) && (IBOOT)
#define CC_IBOOT 1
#else
#define CC_IBOOT 0
#endif
#if defined(TARGET_OS_BRIDGE)
#define CC_BRIDGE TARGET_OS_BRIDGE
#else
#define CC_BRIDGE 0
#endif
// Check if we're running on a generic, userspace platform, i.e., not in the kernel, SEP, etc.
#ifndef CC_GENERIC_PLATFORM
#define CC_GENERIC_PLATFORM \
(!CC_RTKIT && !CC_KERNEL && !CC_USE_L4 && \
!CC_RTKITROM && !CC_EFI && !CC_IBOOT && \
!CC_USE_SEPROM && !CC_ANDROID && !CC_LINUX && \
!CC_BRIDGE)
#endif
// Defined by the XNU build scripts
// Applies to code embedded in XNU but NOT to the kext
#if defined(XNU_KERNEL_PRIVATE)
#define CC_XNU_KERNEL_PRIVATE 1
#else
#define CC_XNU_KERNEL_PRIVATE 0
#endif
// handle unaligned data, if the cpu cannot. Currently for gladman AES and the C version of the SHA256
#define CC_HANDLE_UNALIGNED_DATA CC_BASEBAND
// BaseBand configuration
#if CC_BASEBAND
// -- ENDIANNESS
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
#if defined(ENDIAN_LITTLE) || (defined(__arm__) && !defined(__BIG_ENDIAN))
#define __LITTLE_ENDIAN__
#elif !defined(ENDIAN_BIG) && !defined(__BIG_ENDIAN)
#error Baseband endianness not defined.
#endif
#define AESOPT_ENDIAN_NO_FILE
#endif
// -- Architecture
#define CCN_UNIT_SIZE 4 // 32 bits
// -- External function
#define assert ASSERT // sanity
// -- Warnings
// Ignore irrelevant warnings after verification
// #1254-D: arithmetic on pointer to void or function type
// #186-D: pointless comparison of unsigned integer with zero
// #546-D: transfer of control bypasses initialization of
#ifdef __arm__
#pragma diag_suppress 186, 1254,546
#elif defined(__GNUC__)
// warning: pointer of type 'void *' used in arithmetic
#pragma GCC diagnostic ignored "-Wpointer-arith"
#endif // __arm__
#define CC_SMALL_CODE 1
#endif // CC_BASEBAND
#if CC_RTKIT || CC_RTKITROM
#define CC_SMALL_CODE 1
#endif
#ifndef CC_SMALL_CODE
#define CC_SMALL_CODE 0
#endif
//CC_DARWIN indicates the availability of XNU kernel functions,
//like what we have on OSX, iOS, tvOS, Watch OS
#if (CC_USE_L4 || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_EFI || CC_LINUX || defined(_WIN32) || CC_BASEBAND || CC_USE_S3 || CC_ANDROID)
#define CC_DARWIN 0
#else
#define CC_DARWIN 1
#endif
//arm arch64 definition for gcc
#if defined(__GNUC__) && defined(__aarch64__) && !defined(__arm64__)
#define __arm64__
#endif
#if !defined(CCN_UNIT_SIZE)
#if defined(__arm64__) || defined(__x86_64__) || defined(_WIN64)
#define CCN_UNIT_SIZE 8
#elif defined(__arm__) || defined(__i386__) || defined(_WIN32)
#define CCN_UNIT_SIZE 4
#else
#error undefined architecture
#endif
#endif /* !defined(CCN_UNIT_SIZE) */
//this allows corecrypto Windows development using xcode
#if defined(CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT)
#if CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT && CC_DARWIN && CORECRYPTO_DEBUG
#define CC_USE_ASM 0
#define CC_USE_HEAP_FOR_WORKSPACE 1
#if (CCN_UNIT_SIZE==8)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0
#else
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1
#endif
#endif
#endif
#if !defined(CCN_UINT128_SUPPORT_FOR_64BIT_ARCH)
#if defined(_WIN64) && defined(_WIN32) && (CCN_UNIT_SIZE==8)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 0
#elif defined(_WIN32)
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1 // should not be a problem
#else
#define CCN_UINT128_SUPPORT_FOR_64BIT_ARCH 1
#endif
#endif
#if defined(_MSC_VER)
#if defined(__clang__)
#define CC_ALIGNED(x) __attribute__ ((aligned(x))) //clang compiler
#else
#define CC_ALIGNED(x) __declspec(align(x)) //MS compiler
#endif
#else
#if __clang__ || CCN_UNIT_SIZE==8
#define CC_ALIGNED(x) __attribute__ ((aligned(x)))
#else
#define CC_ALIGNED(x) __attribute__ ((aligned((x)>8?8:(x))))
#endif
#endif
#if defined(__arm__)
//this is copied from <arm/arch.h>, because <arm/arch.h> is not available in the SEPROM environment
#if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__) || defined(__ARM_ARCH_7EM__)
#define _ARM_ARCH_7
#endif
#if defined(__ARM_ARCH_6M__) || defined(__TARGET_ARCH_6S_M) || defined (__armv6m__)
#define _ARM_ARCH_6M
#endif
#endif
#if defined(__arm64__) || defined(__arm__)
#define CCN_IOS 1
#define CCN_OSX 0
#elif defined(__x86_64__) || defined(__i386__)
#define CCN_IOS 0
#define CCN_OSX 1
#endif
#if CC_USE_S3
/* For corecrypto kext, CC_STATIC should be undefined */
#define CC_STATIC 1
#endif
#if !defined(CC_USE_HEAP_FOR_WORKSPACE)
#if CC_USE_S3 || CC_USE_SEPROM || CC_RTKITROM
#define CC_USE_HEAP_FOR_WORKSPACE 0
#else
#define CC_USE_HEAP_FOR_WORKSPACE 1
#endif
#endif
/* memset_s is only available in a few targets */
#if CC_USE_SEPROM || defined(__CC_ARM) \
|| defined(__hexagon__) || CC_EFI
#define CC_HAS_MEMSET_S 0
#else
#define CC_HAS_MEMSET_S 1
#endif
// Include target conditionals if available.
#if defined(__has_include) /* portability */
#if __has_include(<TargetConditionals.h>)
#include <TargetConditionals.h>
#endif /* __has_include(<TargetConditionals.h>) */
#endif /* defined(__has_include) */
// Disable RSA Keygen on iBridge
#if defined(TARGET_OS_BRIDGE) && TARGET_OS_BRIDGE && CC_KERNEL
#define CC_DISABLE_RSAKEYGEN 1 /* for iBridge */
#else
#define CC_DISABLE_RSAKEYGEN 0 /* default */
#endif
#if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__))
#define CCEC25519_CURVE25519_64BIT 1
#else
#define CCEC25519_CURVE25519_64BIT 0
#endif
//- functions implemented in assembly ------------------------------------------
// This is the list of corecrypto clients that use assembly and the clang compiler.
#if !(CC_DARWIN || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG
#warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform"
#endif
// Enable assembler in Linux if CC_LINUX_ASM is defined
#if CC_LINUX && defined(CC_LINUX_ASM) && CC_LINUX_ASM
#define CC_USE_ASM 1
#endif
// Use this macro to strictly disable assembly regardless of cpu/os/compiler/etc.
// Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well.
#if !defined(CC_USE_ASM)
#if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_LINUX
#define CC_USE_ASM 0
#else
#define CC_USE_ASM 1
#endif
#endif
#define CC_CACHE_DESCRIPTORS CC_KERNEL
//-(1) ARM V7
#if defined(_ARM_ARCH_7) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR CC_SMALL_CODE
#define CCN_MUL_KARATSUBA 0 // no performance improvement
#define CCN_ADD_ASM 1
#define CCN_SUB_ASM 1
#define CCN_MUL_ASM 0
#define CCN_ADDMUL1_ASM 1
#define CCN_MUL1_ASM 1
#define CCN_CMP_ASM 1
#define CCN_ADD1_ASM 1
#define CCN_SUB1_ASM 1
#define CCN_N_ASM 1
#define CCN_SET_ASM 1
#define CCN_SHIFT_RIGHT_ASM 1
#if defined(__ARM_NEON__)
#define CCN_SHIFT_LEFT_ASM 1
#else
#define CCN_SHIFT_LEFT_ASM 0
#endif
#define CCN_MULMOD_224_ASM 1
#define CCN_MULMOD_256_ASM 1
#define CCAES_ARM_ASM 1
#define CCAES_INTEL_ASM 0
#if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3
#define CCAES_MUX 0
#else
#define CCAES_MUX 1
#endif
#define CCN_USE_BUILTIN_CLZ 1
#define CCSHA1_VNG_INTEL 0
#define CCSHA2_VNG_INTEL 0
#if defined(__ARM_NEON__) || CC_KERNEL
#define CCSHA1_VNG_ARM 1
#define CCSHA2_VNG_ARM 1
#else /* !defined(__ARM_NEON__) */
#define CCSHA1_VNG_ARM 0
#define CCSHA2_VNG_ARM 0
#endif /* !defined(__ARM_NEON__) */
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 1
//-(2) ARM 64
#elif defined(__arm64__) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR CC_SMALL_CODE
#define CCN_MUL_KARATSUBA 0 // 4*n CCN_UNIT extra memory required.
#define CCN_ADD_ASM 1
#define CCN_SUB_ASM 1
#define CCN_MUL_ASM 1
#define CCN_ADDMUL1_ASM 0
#define CCN_MUL1_ASM 0
#define CCN_CMP_ASM 1
#define CCN_ADD1_ASM 0
#define CCN_SUB1_ASM 0
#define CCN_N_ASM 1
#define CCN_SET_ASM 0
#define CCN_SHIFT_RIGHT_ASM 1
#define CCN_SHIFT_LEFT_ASM 1
#define CCN_MULMOD_224_ASM 1
#define CCN_MULMOD_256_ASM 1
#define CCAES_ARM_ASM 1
#define CCAES_INTEL_ASM 0
#define CCAES_MUX 0 // On 64-bit SoCs, asm is much faster than HW
#define CCN_USE_BUILTIN_CLZ 1
#define CCSHA1_VNG_INTEL 0
#define CCSHA2_VNG_INTEL 0
#define CCSHA1_VNG_ARM 1
#define CCSHA2_VNG_ARM 1
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 1
//-(3) Intel 32/64
#elif (defined(__x86_64__) || defined(__i386__)) && __clang__ && CC_USE_ASM
#define CCN_DEDICATED_SQR 1
#define CCN_MUL_KARATSUBA 0 // 4*n CCN_UNIT extra memory required.
/* These assembly routines only work for a single CCN_UNIT_SIZE. */
#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8) || (defined(__i386__) && CCN_UNIT_SIZE == 4)
#define CCN_ADD_ASM 1
#define CCN_SUB_ASM 1
#define CCN_MUL_ASM 1
#else
#define CCN_ADD_ASM 0
#define CCN_SUB_ASM 0
#define CCN_MUL_ASM 0
#endif
#if (defined(__x86_64__) && CCN_UNIT_SIZE == 8)
#define CCN_CMP_ASM 1
#define CCN_N_ASM 1
#define CCN_SHIFT_RIGHT_ASM 1
#define CCN_SHIFT_LEFT_ASM 1
#else
#define CCN_CMP_ASM 0
#define CCN_N_ASM 0
#define CCN_SHIFT_RIGHT_ASM 0
#define CCN_SHIFT_LEFT_ASM 0
#endif
#define CCN_MULMOD_224_ASM 0
#if defined(__x86_64__) && CCN_UNIT_SIZE == 8
#define CCN_MULMOD_256_ASM 1
#define CCN_ADDMUL1_ASM 1
#define CCN_MUL1_ASM 1
#else
#define CCN_MULMOD_256_ASM 0
#define CCN_ADDMUL1_ASM 0
#define CCN_MUL1_ASM 0
#endif
#define CCN_ADD1_ASM 0
#define CCN_SUB1_ASM 0
#define CCN_SET_ASM 0
#define CCAES_ARM_ASM 0
#define CCAES_INTEL_ASM 1
#define CCAES_MUX 0
#define CCN_USE_BUILTIN_CLZ 0
#define CCSHA1_VNG_INTEL 1
#define CCSHA2_VNG_INTEL 1
#define CCSHA1_VNG_ARM 0
#define CCSHA2_VNG_ARM 0
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 1
//-(4) disable assembly
#else
#if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH
#define CCN_DEDICATED_SQR 1
#else
#define CCN_DEDICATED_SQR 0 // When assembly is off and 128-bit integers are not supported, dedicated squaring is off. This is the case on Windows.
#endif
#define CCN_MUL_KARATSUBA 0 // 4*n CCN_UNIT extra memory required.
#define CCN_ADD_ASM 0
#define CCN_SUB_ASM 0
#define CCN_MUL_ASM 0
#define CCN_ADDMUL1_ASM 0
#define CCN_MUL1_ASM 0
#define CCN_CMP_ASM 0
#define CCN_ADD1_ASM 0
#define CCN_SUB1_ASM 0
#define CCN_N_ASM 0
#define CCN_SET_ASM 0
#define CCN_SHIFT_RIGHT_ASM 0
#define CCN_SHIFT_LEFT_ASM 0
#define CCN_MULMOD_224_ASM 0
#define CCN_MULMOD_256_ASM 0
#define CCAES_ARM_ASM 0
#define CCAES_INTEL_ASM 0
#define CCAES_MUX 0
#define CCN_USE_BUILTIN_CLZ 0
#define CCSHA1_VNG_INTEL 0
#define CCSHA2_VNG_INTEL 0
#define CCSHA1_VNG_ARM 0
#define CCSHA2_VNG_ARM 0
#define CCSHA256_ARMV6M_ASM 0
#define CC_ACCELERATECRYPTO 0
#endif
#define CC_INLINE static inline
#ifdef __GNUC__
#define CC_NORETURN __attribute__((__noreturn__))
#define CC_NOTHROW __attribute__((__nothrow__))
#define CC_NONNULL(N) __attribute__((__nonnull__ N))
#define CC_NONNULL4 CC_NONNULL((4))
#define CC_NONNULL_ALL __attribute__((__nonnull__))
#define CC_SENTINEL __attribute__((__sentinel__))
// Only apply the `CC_CONST` attribute to functions with no side effects whose output is a strict function of pass-by-value inputs with no exterior side effects.
// Specifically, do not apply CC_CONST if the function has any arguments that are pointers (directly or indirectly).
#define CC_CONST __attribute__((__const__))
#define CC_PURE __attribute__((__pure__))
#define CC_WARN_RESULT __attribute__((__warn_unused_result__))
#define CC_MALLOC_CLEAR __attribute__((__malloc__))
#define CC_UNUSED __attribute__((unused))
#else /* !__GNUC__ */
/*! @parseOnly */
#define CC_UNUSED
/*! @parseOnly */
#define CC_NONNULL(N)
/*! @parseOnly */
#define CC_NONNULL4
/*! @parseOnly */
#define CC_NORETURN
/*! @parseOnly */
#define CC_NOTHROW
/*! @parseOnly */
#define CC_NONNULL_ALL
/*! @parseOnly */
#define CC_SENTINEL
/*! @parseOnly */
#define CC_CONST
/*! @parseOnly */
#define CC_PURE
/*! @parseOnly */
#define CC_WARN_RESULT
/*! @parseOnly */
#define CC_MALLOC_CLEAR
#endif /* !__GNUC__ */
// Bridge differences between Mach-O and ELF compilers/assemblers.
#if CC_LINUX
#define CC_ASM_SECTION_CONST .rodata
#define CC_ASM_PRIVATE_EXTERN .hidden
#if CC_LINUX
// We need to be sure that assembler can access relocated C
// symbols. Sad but this is the quickest way to do that, at least with
// our current linux compiler (clang-3.4).
#define CC_C_LABEL(_sym) _sym@PLT
#endif
#define _IMM(x) $(x)
#else /* !CC_LINUX */
#define CC_ASM_SECTION_CONST .const
#define CC_ASM_PRIVATE_EXTERN .private_extern
#define CC_C_LABEL(_sym) _##_sym
#define _IMM(x) $$(x)
#endif /* !CC_LINUX */
// Enable FIPSPOST function tracing only when supported.
#ifdef CORECRYPTO_POST_TRACE
#define CC_FIPSPOST_TRACE 1
#else
#define CC_FIPSPOST_TRACE 0
#endif
#ifndef CC_INTERNAL_SDK
#if __has_include(<System/i386/cpu_capabilities.h>)
#define CC_INTERNAL_SDK 1
#elif __has_include(<System/arm/cpu_capabilities.h>)
#define CC_INTERNAL_SDK 1
#else
#define CC_INTERNAL_SDK 0
#endif
#endif
// Currently thread sanitizer is only supported in local builds.
// Please edit your "corecrypto_test" scheme to build with thread
// sanitizer and then remove *all* variants of corecrypto_static
// besides "normal"
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define CC_TSAN 1
#else
#define CC_TSAN 0
#endif // __has_feature(thread_sanitizer)
#else
#define CC_TSAN 0
#endif // __has_feature
#endif /* _CORECRYPTO_CC_CONFIG_H_ */

76
cc/corecrypto/cc_debug.h Normal file
View File

@ -0,0 +1,76 @@
/* Copyright (c) (2012,2014-2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
// Debug configuration header file
#ifndef _CORECRYPTO_CCN_DEBUG_H_
#define _CORECRYPTO_CCN_DEBUG_H_
#include <corecrypto/cc_config.h>
// DO NOT INCLUDE this HEADER file in CoreCrypto files added for XNU project or headers
// included by external clients.
// ========================
// Printf for corecrypto
// ========================
#if CC_KERNEL
#include <pexpert/pexpert.h>
#define cc_printf(x...) kprintf(x)
#if !CONFIG_EMBEDDED
extern int printf(const char *format, ...) __printflike(1,2);
#endif
#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT || CC_RTKITROM
#include <stdio.h>
#define cc_printf(x...) printf(x)
#elif defined(__ANDROID_API__)
#include <android/log.h>
#define cc_printf(x...) __android_log_print(ANDROID_LOG_DEBUG, "corecrypto", x);
#else
#include <stdio.h>
#define cc_printf(x...) fprintf(stderr, x)
#endif
// ========================
// Integer types
// ========================
#if CC_KERNEL
/* These are not defined in libkern */
#define PRIx64 "llx"
#define PRIx32 "x"
#define PRIx16 "hx"
#define PRIx8 "hhx"
#else
#include <inttypes.h>
#endif
#if CCN_UNIT_SIZE == 8
#define CCPRIx_UNIT ".016" PRIx64
#elif CCN_UNIT_SIZE == 4
#define CCPRIx_UNIT ".08" PRIx32
#elif CCN_UNIT_SIZE == 2
#define CCPRIx_UNIT ".04" PRIx16
#elif CCN_UNIT_SIZE == 1
#define CCPRIx_UNIT ".02" PRIx8
#else
#error invalid CCN_UNIT_SIZE
#endif
// ========================
// Print utilities for corecrypto
// ========================
#include <corecrypto/cc.h>
/* Print a byte array of arbitrary size */
void cc_print(const char *label, size_t count, const uint8_t *s);
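/* Illustrative usage sketch (not part of the original header): CCPRIx_UNIT is
 * meant to follow a '%' in a cc_printf format string so that a cc_unit prints
 * with the width matching CCN_UNIT_SIZE, and cc_print dumps a raw byte buffer.
 * The variable names below are hypothetical.
 *
 *   cc_unit u = 42;
 *   cc_printf("u = 0x%" CCPRIx_UNIT "\n", u);
 *   cc_print("digest bytes", sizeof(digest), digest);
 */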
#endif /* _CORECRYPTO_CCN_DEBUG_H_ */

165
cc/corecrypto/cc_error.h Normal file
View File

@ -0,0 +1,165 @@
/* Copyright (c) (2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_ERROR_H_
#define _CORECRYPTO_CC_ERROR_H_
enum {
CCERR_OK = 0,
/* the default error code */
CCERR_INTERNAL = -1,
CCERR_INTEGRITY = -2,
CCERR_DEVICE = -3,
CCERR_INTERRUPTS = -4,
CCERR_CRYPTO_CONFIG = -5,
CCERR_PERMS = -6,
CCERR_PARAMETER = -7,
CCERR_MEMORY = -8,
CCERR_FILEDESC = -9,
CCERR_OUT_OF_ENTROPY = -10,
CCERR_ATFORK = -11,
CCERR_OVERFLOW = -12,
CCERR_MEMORY_ALLOC_FAIL = -13,
CCEC_GENERATE_KEY_DEFAULT_ERR = -14,
CCEC_GENERATE_KEY_TOO_MANY_TRIES = -15,
CCEC_GENERATE_KEY_MULT_FAIL = -16,
CCEC_GENERATE_KEY_AFF_FAIL = -17,
CCEC_GENERATE_KEY_CONSISTENCY = -18,
CCEC_GENERATE_NOT_ON_CURVE = -19,
CCEC_GENERATE_NOT_ENOUGH_ENTROPY = -20,
CCEC_GENERATE_NOT_SUPPORTED = -21,
CCEC_GENERATE_INVALID_INPUT = -22,
// Program error: buffer too small or encrypted message is too small
CCRSA_INVALID_INPUT = -23,
// Invalid crypto configuration: Hash length versus RSA key size
CCRSA_INVALID_CONFIG = -24,
CCRSA_ENCODING_ERROR = -25,
CCRSA_DECODING_ERROR = -26,
// The data is invalid (we won't say more for security)
CCRSA_PRIVATE_OP_ERROR = -27,
CCRSA_KEY_ERROR = -28,
// Key generation specific
CCRSA_KEYGEN_PRIME_NOT_FOUND = -29,
CCRSA_KEYGEN_PRIME_NEED_NEW_SEED = -30,
CCRSA_KEYGEN_PRIME_TOO_MANY_ITERATIONS = -31,
CCRSA_KEYGEN_PRIME_SEED_GENERATION_ERROR = -32,
CCRSA_KEYGEN_MODULUS_CRT_INV_ERROR = -33,
CCRSA_KEYGEN_NEXT_PRIME_ERROR = -34,
CCRSA_KEYGEN_SEED_X_ERROR = -35,
CCRSA_KEYGEN_SEED_r_ERROR = -36,
CCRSA_KEYGEN_KEYGEN_CONSISTENCY_FAIL = -37,
CCRSA_KEYGEN_R1R2_SIZE_ERROR = -38,
CCRSA_KEYGEN_PQ_DELTA_ERROR = -39,
CCRSA_FIPS_KEYGEN_DISABLED = -40,
CCZP_INV_ERROR = -41,
CCZP_INV_NO_INVERSE = -42,
CCZP_INV_INVALID_INPUT = -43,
CCZ_INVALID_INPUT_ERROR = -44,
CCZ_INVALID_RADIX_ERROR = -45,
CCDH_ERROR_DEFAULT = -46,
CCDH_GENERATE_KEY_TOO_MANY_TRIES = -47,
CCDH_NOT_SUPPORTED_CONFIGURATION = -48,
CCDH_SAFETY_CHECK = -49,
CCDH_PUBLIC_KEY_MISSING = -50,
CCDH_INVALID_DOMAIN_PARAMETER = -51,
CCDH_INVALID_INPUT = -52,
CCDH_DOMAIN_PARAMETER_MISMATCH = -53,
CCDH_GENERATE_KEY_CONSISTENCY = -54,
CCSRP_ERROR_DEFAULT = -55,
CCSRP_GENERATE_KEY_TOO_MANY_TRIES = -56,
CCSRP_NOT_SUPPORTED_CONFIGURATION = -57,
CCSRP_SAFETY_CHECK = -58,
CCSRP_PUBLIC_KEY_MISSING = -59,
CCSRP_INVALID_DOMAIN_PARAMETER = -60,
CCDRBG_STATUS_ERROR = -61,
CCDRBG_STATUS_NEED_RESEED = -62,
CCDRBG_STATUS_PARAM_ERROR = -63,
// If this value is returned, the caller must abort or panic the process for
// security reasons, for example in the case of a catastrophic error as described in
// http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf
// ccdrbg calls abort() or panic() if they are available on the system.
CCDRBG_STATUS_ABORT = -64,
CCKPRNG_NEED_ENTROPY = -65,
CCKPRNG_ABORT = -66,
CCMODE_INVALID_INPUT = -67,
CCMODE_INVALID_CALL_SEQUENCE = -68,
CCMODE_INTEGRITY_FAILURE = -69,
CCMODE_NOT_SUPPORTED = -70,
CCMODE_INTERNAL_ERROR = -71,
// Configuration or unexpected issue
CCPOST_GENERIC_FAILURE = -72,
CCPOST_LIBRARY_ERROR = -73,
CCPOST_INTEGRITY_ERROR = -74,
// Output of the algo is not as expected
CCPOST_KAT_FAILURE = -75,
CCKPRNG_SEEDFILE_OPEN = -76,
CCKPRNG_SEEDFILE_READ = -78,
CCKPRNG_SEEDFILE_WRITE = -79,
CCKPRNG_SEEDFILE_CHMOD = -80,
CCKPRNG_SEEDFILE_CHOWN = -81,
CCKPRNG_RANDOMDEV_OPEN = -82,
CCKPRNG_RANDOMDEV_WRITE = -83,
CCKPRNG_GETENTROPY = -84,
CCSAE_HUNTPECK_EXCEEDED_MAX_TRIALS = -85,
CCERR_CALL_SEQUENCE = -86,
CCVRF_POINT_DECODE_FAILURE = -87,
CCVRF_POINT_INVALID_PUBLIC_KEY = -88,
CCVRF_VERIFY_FAILURE = -89,
// Error codes for Authenticated Encryption Modes
CCMODE_TAG_LENGTH_REQUEST_TOO_LONG = -100,
CCMODE_TAG_LENGTH_TOO_SHORT = -101,
CCMODE_NONCE_EMPTY = -102,
CCMODE_AD_EMPTY = -103,
CCMODE_DECRYPTION_OR_VERIFICATION_ERR=-104,
CCMODE_BUFFER_OUT_IN_OVERLAP = -105,
CCSAE_NOT_ENOUGH_COMMIT_PARTIAL_CALLS = -132,
CCSAE_GENERATE_COMMIT_CALL_AGAIN = -133,
CCERR_VALID_SIGNATURE = CCERR_OK,
CCERR_INVALID_SIGNATURE = -146,
CCERR_IOSERVICE_GETMATCHING = -147,
CCERR_IOSERVICE_OPEN = -148,
CCERR_IOCONNECT_CALL = -149,
CCEC_KEY_CANNOT_BE_UNIT = -160,
CCEC_COMPRESSED_POINT_ENCODING_ERROR = -161,
CCERR_RNG_NOT_SEEDED = -162,
};
#define CCDRBG_STATUS_OK CCERR_OK
#define CCKPRNG_OK CCERR_OK
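// Illustrative convention (not part of the original header): corecrypto
// routines return CCERR_OK (0) on success and a negative code otherwise, so
// callers usually propagate the value unchanged; `op` below is hypothetical.
//
//   int rv = op(...);
//   if (rv != CCERR_OK) { return rv; }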
#endif /* _CORECRYPTO_CC_ERROR_H_ */

View File

@ -0,0 +1,29 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef corecrypto_cc_fault_canary_h
#define corecrypto_cc_fault_canary_h
#include "cc.h"
#define CC_FAULT_CANARY_SIZE 16
typedef uint8_t cc_fault_canary_t[CC_FAULT_CANARY_SIZE];
extern const cc_fault_canary_t CCEC_FAULT_CANARY;
extern const cc_fault_canary_t CCRSA_PKCS1_FAULT_CANARY;
extern const cc_fault_canary_t CCRSA_PSS_FAULT_CANARY;
#define CC_FAULT_CANARY_MEMCPY(_dst_, _src_) memcpy(_dst_, _src_, CC_FAULT_CANARY_SIZE)
#define CC_FAULT_CANARY_CLEAR(_name_) memset(_name_, 0x00, CC_FAULT_CANARY_SIZE)
#define CC_FAULT_CANARY_EQUAL(_a_, _b_) (cc_cmp_safe(CC_FAULT_CANARY_SIZE, _a_, _b_) == 0)
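// Illustrative usage sketch (not part of the original header): a caller zeroes
// a canary, passes it to an operation that fills it on success, and only
// trusts the result if the canary matches the expected constant. The function
// name `sign_or_verify_op` is hypothetical.
//
//   cc_fault_canary_t canary;
//   CC_FAULT_CANARY_CLEAR(canary);
//   int rv = sign_or_verify_op(..., canary);
//   if (rv == 0 && CC_FAULT_CANARY_EQUAL(canary, CCRSA_PKCS1_FAULT_CANARY)) {
//       // accept the result
//   }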
#endif /* corecrypto_cc_fault_canary_h */

View File

@ -0,0 +1,27 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef corecrypto_cc_fault_canary_internal_h
#define corecrypto_cc_fault_canary_internal_h
/*!
@function cc_fault_canary_set
@abstract Set the output `fault_canary_out` to the value `fault_canary` if the two inputs are equal.
@param fault_canary_out Output fault canary value
@param fault_canary Fault canary for a specific operation (e.g. CCEC_FAULT_CANARY for ECC signing)
@param nbytes Byte length of inputs in1 and in2
@param in1 Input one
@param in2 Input two
*/
void cc_fault_canary_set(cc_fault_canary_t fault_canary_out, const cc_fault_canary_t fault_canary, size_t nbytes, const uint8_t *in1, const uint8_t *in2);
#endif /* corecrypto_cc_fault_canary_internal_h */

View File

@ -0,0 +1,16 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <stdbool.h>
#include <stdint.h>
#include <corecrypto/cc_priv.h>
extern bool cc_rdrand(uint64_t *rand);

150
cc/corecrypto/cc_macros.h Normal file
View File

@ -0,0 +1,150 @@
/* Copyright (c) (2012,2015,2016,2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_MACROS_H_
#define _CORECRYPTO_CC_MACROS_H_
#include <corecrypto/cc_config.h>
#ifndef __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING
#define __CC_DEBUG_ASSERT_COMPONENT_NAME_STRING ""
#endif
#ifndef __CC_DEBUG_ASSERT_PRODUCTION_CODE
#define __CC_DEBUG_ASSERT_PRODUCTION_CODE !CORECRYPTO_DEBUG
#endif
#if CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS
#if !CC_KERNEL
#include <string.h> // for strstr
#endif // !CC_KERNEL
CC_UNUSED static char *cc_strstr(const char *file) {
#if CC_KERNEL
(void) file;
#else
const char cc_char []="corecrypto";
char *p=strstr(file, cc_char);
if (p) return (p+strlen(cc_char)+1);
#endif
return NULL;
}
#define __CC_DEBUG_REQUIRE_MESSAGE(name, assertion, label, message, file, line, value) \
{char *___t = cc_strstr(file); cc_printf( "require: %s, %s%s:%d\n", assertion, (message!=0) ? message : "", ___t==NULL?file:___t, line);}
#endif // CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS
#ifndef cc_require
#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
#if defined(_WIN32) && defined (__clang__)
#define cc_require(assertion, exceptionLabel) \
do { \
if (!(assertion) ) { \
goto exceptionLabel; \
} \
} while ( 0 )
#else
#define cc_require(assertion, exceptionLabel) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#else
#define cc_require(assertion, exceptionLabel) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
__CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
#assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#endif
#ifndef cc_require_action
#if __CC_DEBUG_ASSERT_PRODUCTION_CODE || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
#if defined(_WIN32) && defined(__clang__)
#define cc_require_action(assertion, exceptionLabel, action) \
do \
{ \
if (!(assertion)) \
{ \
{ \
action; \
} \
goto exceptionLabel; \
} \
} while ( 0 )
#else
#define cc_require_action(assertion, exceptionLabel, action) \
do \
{ \
if ( __builtin_expect(!(assertion), 0) ) \
{ \
{ \
action; \
} \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#else
#define cc_require_action(assertion, exceptionLabel, action) \
do \
{ \
if ( __builtin_expect(!(assertion), 0) ) \
{ \
__CC_DEBUG_REQUIRE_MESSAGE( \
__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
#assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \
{ \
action; \
} \
goto exceptionLabel; \
} \
} while ( 0 )
#endif
#endif
#ifndef cc_require_or_return
#if (__CC_DEBUG_ASSERT_PRODUCTION_CODE) || (!CORECRYPTO_DEBUG_ENABLE_CC_REQUIRE_PRINTS)
#if defined(_WIN32) && defined (__clang__)
#define cc_require_or_return(assertion, value) \
do { \
if (!(assertion) ) { \
return value; \
} \
} while ( 0 )
#else
#define cc_require_or_return(assertion, value) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
return value; \
} \
} while ( 0 )
#endif
#else
#define cc_require_or_return(assertion, value) \
do { \
if ( __builtin_expect(!(assertion), 0) ) { \
__CC_DEBUG_REQUIRE_MESSAGE(__CC_DEBUG_ASSERT_COMPONENT_NAME_STRING, \
 #assertion, #value, 0, __FILE__, __LINE__, 0); \
return value; \
} \
} while ( 0 )
#endif
#endif
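// Illustrative usage sketch (not part of the original header): the typical
// cc_require / cc_require_action pattern bails out to a cleanup label when a
// check fails. The function name, label, and error values below are hypothetical.
CC_UNUSED static int cc_require_usage_example(const void *in, int len)
{
    int rv = -7; // e.g. CCERR_PARAMETER
    cc_require(in != NULL, out);              // jump to "out" when the check fails
    cc_require_action(len > 0, out, rv = -12); // run the action, then jump to "out"
    rv = 0; // success, e.g. CCERR_OK
out:
    return rv;
}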
#endif /* _CORECRYPTO_CC_MACROS_H_ */

192
cc/corecrypto/cc_memory.h Normal file
View File

@ -0,0 +1,192 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include "cc_config.h"
#ifndef corecrypto_cc_memory_h
#define corecrypto_cc_memory_h
#if CORECRYPTO_DEBUG && !defined(_WIN32) && !defined(_WIN64)
#define CC_ALLOC_DEBUG 1
#endif
struct ws_dbg {
const void *p;
const char *file;
int line;
const char *func;
};
#if defined(CC_ALLOC_DEBUG)
extern struct ws_dbg g_ws_dbg;
#endif
#include <corecrypto/cc_config.h>
#include <corecrypto/cc_error.h>
#include "cc_debug.h"
#include <corecrypto/cc_priv.h>
CC_INLINE void cc_alloc_debug(CC_UNUSED const void *p, CC_UNUSED const char *file, CC_UNUSED int line, CC_UNUSED const char *func)
{
#if defined(CC_ALLOC_DEBUG)
// The contract for some clients is to have a single malloc at a time
cc_assert(g_ws_dbg.p == NULL);
g_ws_dbg = (struct ws_dbg){ p, file, line, func };
#endif
}
CC_INLINE void cc_free_debug(CC_UNUSED const void *p)
{
#if defined(CC_ALLOC_DEBUG)
// The contract for some clients is to have a single malloc at a time
cc_assert(g_ws_dbg.p == p); // Free the address we allocated
g_ws_dbg = (struct ws_dbg){};
#endif
}
// =============================================================================
// Declare workspace with memory on the STACK
// This is the least preferred option since most corecrypto clients have
// small stacks. It is still useful for small allocations and when errors
// can't easily be propagated.
// =============================================================================
// Declare a variable on the stack and use its address.
// Only use this when we don't have a way to propagate errors.
#define CC_DECL_WORKSPACE_STACK(ws, n) \
cc_unit ws##_buf[(n)]; \
cc_ws ws##_ctx = { &ws##_buf[0], &ws##_buf[(n)] }; \
cc_ws_t ws = &ws##_ctx; \
cc_alloc_debug(ws->start, __FILE__, __LINE__, __func__);
// Reset pointers to avoid future reference
#define CC_FREE_WORKSPACE_STACK(ws) \
cc_free_debug(ws->start); \
ws->start = NULL; \
ws->end = NULL;
#define CC_CLEAR_AND_FREE_WORKSPACE_STACK(ws) \
cc_try_abort_if(ws->start > ws->end, "free ws"); \
ccn_clear((cc_size)(ws->end - ws->start), ws->start); \
CC_FREE_WORKSPACE_STACK(ws);
// =============================================================================
// Declare workspace in the region corresponding to HEAP or STACK,
// depending on the setting of CC_USE_HEAP_FOR_WORKSPACE.
// This should be the preference for large memory allocations, but it requires
// propagating an error in case of allocation failure.
// =============================================================================
#if CC_USE_HEAP_FOR_WORKSPACE
// Malloc/free functions to be used
#if CC_KERNEL
#include <IOKit/IOLib.h>
#include <vm/pmap.h>
CC_INLINE void *cc_malloc_clear(size_t s)
{
void *p = NULL;
if (pmap_in_ppl()) {
if (s > PAGE_SIZE) {
panic("PPL cc_malloc_clear trying to allocate %zu > PAGE_SIZE", s);
}
p = pmap_claim_reserved_ppl_page();
} else {
p = IOMalloc(s);
}
if (p != NULL) {
memset(p, 0, s);
}
return p;
}
CC_INLINE void cc_free(void *p, size_t size)
{
if (pmap_in_ppl()) {
if (size > PAGE_SIZE) {
panic("PPL cc_malloc_clear trying to free %zu > PAGE_SIZE", size);
}
pmap_free_reserved_ppl_page(p);
return;
}
IOFree(p, size);
}
#else // !CC_KERNEL
#include <stdlib.h>
CC_INLINE void *cc_malloc_clear(size_t s)
{
void *p = malloc(s);
if (p != NULL) {
memset(p, 0, s);
}
return p;
}
CC_INLINE void cc_free(void *p, size_t size CC_UNUSED)
{
free(p);
}
#endif // !CC_KERNEL
#define CC_DECL_WORKSPACE_OR_FAIL(ws, n) \
cc_unit *ws##_buf = (cc_unit *) cc_malloc_clear(ccn_sizeof_n((n))); \
cc_ws ws##_ctx = { &ws##_buf[0], &ws##_buf[(n)] }; \
cc_ws_t ws = &ws##_ctx; \
if (NULL == ws->start) \
return CCERR_MEMORY_ALLOC_FAIL; \
cc_alloc_debug(ws->start, __FILE__, __LINE__, __func__);
// Free and reset pointers to avoid future references
#define CC_FREE_WORKSPACE(ws) \
cc_free_debug(ws->start); \
cc_try_abort_if(ws->start > ws->end, "free ws"); \
cc_free(ws->start, (size_t)(ws->end - ws->start) * sizeof(ws->start[0])); \
ws->start = NULL; \
ws->end = NULL;
#else // !CC_USE_HEAP_FOR_WORKSPACE
// Declare a variable on the stack and use its address.
// We could use alloca, but alloca is neither portable nor secure.
#define CC_DECL_WORKSPACE_OR_FAIL CC_DECL_WORKSPACE_STACK
// Reset pointers to avoid future reference
#define CC_FREE_WORKSPACE CC_FREE_WORKSPACE_STACK
#endif // !CC_USE_HEAP_FOR_WORKSPACE
// =============================================================================
// Common
// =============================================================================
#define CC_CLEAR_AND_FREE_WORKSPACE(ws) \
cc_try_abort_if(ws->start > ws->end, "clear ws"); \
ccn_clear((cc_size)(ws->end - ws->start), ws->start); \
CC_FREE_WORKSPACE(ws);
// To allocate an array of n cc_units in the WS
#define CC_DECL_BP_WS(ws, bp) cc_unit *bp = ws->start;
#define CC_FREE_BP_WS(ws, bp) ws->start = bp;
#define CC_ALLOC_WS(ws, n) \
ws->start; \
ws->start += n; \
cc_try_abort_if(ws->start > ws->end, "alloc ws");
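// Illustrative usage sketch (not part of the original header): a typical
// workspace flow with hypothetical names. Declare the workspace (heap or
// stack, per CC_USE_HEAP_FOR_WORKSPACE), carve out scratch space, roll back,
// then zeroize and release.
//
//   int op(cc_size n) {
//       CC_DECL_WORKSPACE_OR_FAIL(ws, 4 * n); // returns CCERR_MEMORY_ALLOC_FAIL on failure
//       CC_DECL_BP_WS(ws, bp);                // remember the current base pointer
//       cc_unit *t = CC_ALLOC_WS(ws, n);      // n cc_units of scratch
//       /* ... use t ... */
//       CC_FREE_BP_WS(ws, bp);                // roll back the allocation
//       CC_CLEAR_AND_FREE_WORKSPACE(ws);      // zeroize and free
//       return CCERR_OK;
//   }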
#if CC_KERNEL
#include <libkern/section_keywords.h>
#define CC_READ_ONLY_LATE(_t) SECURITY_READ_ONLY_LATE(_t)
#else
#define CC_READ_ONLY_LATE(_t) _t
#endif
#endif // corecrypto_cc_memory_h

818
cc/corecrypto/cc_priv.h Normal file
View File

@ -0,0 +1,818 @@
/* Copyright (c) (2010,2011,2012,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CC_PRIV_H_
#define _CORECRYPTO_CC_PRIV_H_
#include <corecrypto/cc.h>
#include <stdbool.h>
#include <stdint.h>
// Fork handlers for the stateful components of corecrypto.
void cc_atfork_prepare(void);
void cc_atfork_parent(void);
void cc_atfork_child(void);
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#ifndef __DECONST
#define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
#endif
/* defines the following macros :
 CC_ARRAY_LEN: returns the number of elements in an array
 CC_STORE32_BE : store a 32-bit value in big-endian order in an unaligned buffer.
 CC_STORE32_LE : store a 32-bit value in little-endian order in an unaligned buffer.
 CC_STORE64_BE : store a 64-bit value in big-endian order in an unaligned buffer.
 CC_STORE64_LE : store a 64-bit value in little-endian order in an unaligned buffer.
 CC_LOAD32_BE : load a 32-bit value in big-endian order from an unaligned buffer.
 CC_LOAD32_LE : load a 32-bit value in little-endian order from an unaligned buffer.
 CC_LOAD64_BE : load a 64-bit value in big-endian order from an unaligned buffer.
 CC_LOAD64_LE : load a 64-bit value in little-endian order from an unaligned buffer.
 CC_ROR : rotate right 32 bits. The rotate count can be a variable.
 CC_ROL : rotate left 32 bits. The rotate count can be a variable.
 CC_RORc : rotate right 32 bits. The rotate count must be a constant.
 CC_ROLc : rotate left 32 bits. The rotate count must be a constant.
 CC_ROR64 : rotate right 64 bits. The rotate count can be a variable.
 CC_ROL64 : rotate left 64 bits. The rotate count can be a variable.
 CC_ROR64c : rotate right 64 bits. The rotate count must be a constant.
 CC_ROL64c : rotate left 64 bits. The rotate count must be a constant.
 CC_BSWAP32 : byte swap a 32-bit variable.
 CC_H2BE32 : convert a 32-bit value between host and big-endian byte order.
 CC_H2LE32 : convert a 32-bit value between host and little-endian byte order.
 CC_BSWAP64 : byte swap a 64-bit variable.
 CC_READ_LE32 : read a 32-bit little-endian value.
 CC_WRITE_LE32 : write a 32-bit little-endian value.
 CC_WRITE_LE64 : write a 64-bit little-endian value.
 CC_H2BE64 : convert a 64-bit value between host and big-endian byte order.
 CC_H2LE64 : convert a 64-bit value between host and little-endian byte order.
*/
// RTKitOSPlatform should replace CC_MEMCPY with memcpy
#define CC_MEMCPY(D,S,L) cc_memcpy((D),(S),(L))
#define CC_MEMMOVE(D,S,L) cc_memmove((D),(S),(L))
#define CC_MEMSET(D,V,L) cc_memset((D),(V),(L))
#if __has_builtin(__builtin___memcpy_chk) && !defined(_MSC_VER)
#define cc_memcpy(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 1))
#define cc_memcpy_nochk(dst, src, len) __builtin___memcpy_chk((dst), (src), (len), __builtin_object_size((dst), 0))
#else
#define cc_memcpy(dst, src, len) memcpy((dst), (src), (len))
#define cc_memcpy_nochk(dst, src, len) memcpy((dst), (src), (len))
#endif
#if __has_builtin(__builtin___memmove_chk) && !defined(_MSC_VER)
#define cc_memmove(dst, src, len) __builtin___memmove_chk((dst), (src), (len), __builtin_object_size((dst), 1))
#else
#define cc_memmove(dst, src, len) memmove((dst), (src), (len))
#endif
#if __has_builtin(__builtin___memset_chk) && !defined(_MSC_VER)
#define cc_memset(dst, val, len) __builtin___memset_chk((dst), (val), (len), __builtin_object_size((dst), 1))
#else
#define cc_memset(dst, val, len) memset((dst), (val), (len))
#endif
#define CC_ARRAY_LEN(x) (sizeof((x))/sizeof((x)[0]))
// MARK: - Loads and Store
// MARK: -- 32 bits - little endian
// MARK: --- Default version
#define CC_STORE32_LE(x, y) do { \
((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[0] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD32_LE(x, y) do { \
x = ((uint32_t)(((const unsigned char *)(y))[3] & 255)<<24) | \
((uint32_t)(((const unsigned char *)(y))[2] & 255)<<16) | \
((uint32_t)(((const unsigned char *)(y))[1] & 255)<<8) | \
((uint32_t)(((const unsigned char *)(y))[0] & 255)); \
} while(0)
// MARK: -- 64 bits - little endian
#define CC_STORE64_LE(x, y) do { \
((unsigned char *)(y))[7] = (unsigned char)(((x)>>56)&255); \
((unsigned char *)(y))[6] = (unsigned char)(((x)>>48)&255); \
((unsigned char *)(y))[5] = (unsigned char)(((x)>>40)&255); \
((unsigned char *)(y))[4] = (unsigned char)(((x)>>32)&255); \
((unsigned char *)(y))[3] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[0] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD64_LE(x, y) do { \
x = (((uint64_t)(((const unsigned char *)(y))[7] & 255))<<56) | \
(((uint64_t)(((const unsigned char *)(y))[6] & 255))<<48) | \
(((uint64_t)(((const unsigned char *)(y))[5] & 255))<<40) | \
(((uint64_t)(((const unsigned char *)(y))[4] & 255))<<32) | \
(((uint64_t)(((const unsigned char *)(y))[3] & 255))<<24) | \
(((uint64_t)(((const unsigned char *)(y))[2] & 255))<<16) | \
(((uint64_t)(((const unsigned char *)(y))[1] & 255))<<8) | \
(((uint64_t)(((const unsigned char *)(y))[0] & 255))); \
} while(0)
// MARK: -- 32 bits - big endian
// MARK: --- intel version
#if (defined(__i386__) || defined(__x86_64__)) && !defined(_MSC_VER)
#define CC_STORE32_BE(x, y) \
__asm__ __volatile__ ( \
"bswapl %0 \n\t" \
"movl %0,(%1)\n\t" \
"bswapl %0 \n\t" \
::"r"(x), "r"(y))
#define CC_LOAD32_BE(x, y) \
__asm__ __volatile__ ( \
"movl (%1),%0\n\t" \
"bswapl %0\n\t" \
:"=r"(x): "r"(y))
#else
// MARK: --- default version
#define CC_STORE32_BE(x, y) do { \
((unsigned char *)(y))[0] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[3] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD32_BE(x, y) do { \
x = ((uint32_t)(((const unsigned char *)(y))[0] & 255)<<24) | \
((uint32_t)(((const unsigned char *)(y))[1] & 255)<<16) | \
((uint32_t)(((const unsigned char *)(y))[2] & 255)<<8) | \
((uint32_t)(((const unsigned char *)(y))[3] & 255)); \
} while(0)
#endif
// MARK: -- 64 bits - big endian
// MARK: --- intel 64 bits version
#if defined(__x86_64__) && !defined (_MSC_VER)
#define CC_STORE64_BE(x, y) \
__asm__ __volatile__ ( \
"bswapq %0 \n\t" \
"movq %0,(%1)\n\t" \
"bswapq %0 \n\t" \
::"r"(x), "r"(y))
#define CC_LOAD64_BE(x, y) \
__asm__ __volatile__ ( \
"movq (%1),%0\n\t" \
"bswapq %0\n\t" \
:"=r"(x): "r"(y))
#else
// MARK: --- default version
#define CC_STORE64_BE(x, y) do { \
((unsigned char *)(y))[0] = (unsigned char)(((x)>>56)&255); \
((unsigned char *)(y))[1] = (unsigned char)(((x)>>48)&255); \
((unsigned char *)(y))[2] = (unsigned char)(((x)>>40)&255); \
((unsigned char *)(y))[3] = (unsigned char)(((x)>>32)&255); \
((unsigned char *)(y))[4] = (unsigned char)(((x)>>24)&255); \
((unsigned char *)(y))[5] = (unsigned char)(((x)>>16)&255); \
((unsigned char *)(y))[6] = (unsigned char)(((x)>>8)&255); \
((unsigned char *)(y))[7] = (unsigned char)((x)&255); \
} while(0)
#define CC_LOAD64_BE(x, y) do { \
x = (((uint64_t)(((const unsigned char *)(y))[0] & 255))<<56) | \
(((uint64_t)(((const unsigned char *)(y))[1] & 255))<<48) | \
(((uint64_t)(((const unsigned char *)(y))[2] & 255))<<40) | \
(((uint64_t)(((const unsigned char *)(y))[3] & 255))<<32) | \
(((uint64_t)(((const unsigned char *)(y))[4] & 255))<<24) | \
(((uint64_t)(((const unsigned char *)(y))[5] & 255))<<16) | \
(((uint64_t)(((const unsigned char *)(y))[6] & 255))<<8) | \
(((uint64_t)(((const unsigned char *)(y))[7] & 255))); \
} while(0)
#endif
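// Illustrative usage sketch (not part of the original header): store a 32-bit
// value as big-endian bytes into an unaligned buffer and read it back. The
// function name is hypothetical.
CC_UNUSED static uint32_t cc_store_load32_be_example(uint8_t out[4])
{
    uint32_t v = 0x01020304;
    CC_STORE32_BE(v, out); // out[] = { 0x01, 0x02, 0x03, 0x04 }
    uint32_t r;
    CC_LOAD32_BE(r, out);  // r == 0x01020304
    return r;
}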
// MARK: - 32-bit Rotates
#if defined(_MSC_VER)
// MARK: -- MSVC version
#include <stdlib.h>
#if !defined(__clang__)
#pragma intrinsic(_lrotr,_lrotl)
#endif
#define CC_ROR(x,n) _lrotr(x,n)
#define CC_ROL(x,n) _lrotl(x,n)
#define CC_RORc(x,n) _lrotr(x,n)
#define CC_ROLc(x,n) _lrotl(x,n)
#elif (defined(__i386__) || defined(__x86_64__))
// MARK: -- intel asm version
CC_INLINE uint32_t CC_ROL(uint32_t word, int i)
{
__asm__ ("roll %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
CC_INLINE uint32_t CC_ROR(uint32_t word, int i)
{
__asm__ ("rorl %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
/* Need to be a macro here, because 'i' is an immediate (constant) */
#define CC_ROLc(word, i) \
({ uint32_t _word=(word); \
__asm__ __volatile__ ("roll %2,%0" \
:"=r" (_word) \
:"0" (_word),"I" (i)); \
_word; \
})
#define CC_RORc(word, i) \
({ uint32_t _word=(word); \
__asm__ __volatile__ ("rorl %2,%0" \
:"=r" (_word) \
:"0" (_word),"I" (i)); \
_word; \
})
#else
// MARK: -- default version
CC_INLINE uint32_t CC_ROL(uint32_t word, int i)
{
return ( (word<<(i&31)) | (word>>(32-(i&31))) );
}
CC_INLINE uint32_t CC_ROR(uint32_t word, int i)
{
return ( (word>>(i&31)) | (word<<(32-(i&31))) );
}
#define CC_ROLc(x, y) CC_ROL(x, y)
#define CC_RORc(x, y) CC_ROR(x, y)
#endif
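// Illustrative check (not part of the original header): a left rotation by one
// moves the top bit back into bit 0. The function name is hypothetical.
CC_UNUSED static uint32_t cc_rol32_example(void)
{
    return CC_ROL(0x80000001, 1); // == 0x00000003
}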
// MARK: - 64 bits rotates
#if defined(__x86_64__) && !defined(_MSC_VER) //clang _MSVC doesn't support GNU-style inline assembly
// MARK: -- intel 64 asm version
CC_INLINE uint64_t CC_ROL64(uint64_t word, int i)
{
__asm__("rolq %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
{
__asm__("rorq %%cl,%0"
:"=r" (word)
:"0" (word),"c" (i));
return word;
}
/* Need to be a macro here, because 'i' is an immediate (constant) */
#define CC_ROL64c(word, i) \
({ \
uint64_t _word=(word); \
__asm__("rolq %2,%0" \
:"=r" (_word) \
:"0" (_word),"J" (i)); \
_word; \
})
#define CC_ROR64c(word, i) \
({ \
uint64_t _word=(word); \
__asm__("rorq %2,%0" \
:"=r" (_word) \
:"0" (_word),"J" (i)); \
_word; \
})
#else /* Not x86_64 */
// MARK: -- default C version
CC_INLINE uint64_t CC_ROL64(uint64_t word, int i)
{
return ( (word<<(i&63)) | (word>>(64-(i&63))) );
}
CC_INLINE uint64_t CC_ROR64(uint64_t word, int i)
{
return ( (word>>(i&63)) | (word<<(64-(i&63))) );
}
#define CC_ROL64c(x, y) CC_ROL64(x, y)
#define CC_ROR64c(x, y) CC_ROR64(x, y)
#endif
// MARK: - Byte Swaps
#if __has_builtin(__builtin_bswap32)
#define CC_BSWAP32(x) __builtin_bswap32(x)
#else
CC_INLINE uint32_t CC_BSWAP32(uint32_t x)
{
return
((x & 0xff000000) >> 24) |
((x & 0x00ff0000) >> 8) |
((x & 0x0000ff00) << 8) |
((x & 0x000000ff) << 24);
}
#endif
#if __has_builtin(__builtin_bswap64)
#define CC_BSWAP64(x) __builtin_bswap64(x)
#else
CC_INLINE uint64_t CC_BSWAP64(uint64_t x)
{
return
((x & 0xff00000000000000ULL) >> 56) |
((x & 0x00ff000000000000ULL) >> 40) |
((x & 0x0000ff0000000000ULL) >> 24) |
((x & 0x000000ff00000000ULL) >> 8) |
((x & 0x00000000ff000000ULL) << 8) |
((x & 0x0000000000ff0000ULL) << 24) |
((x & 0x000000000000ff00ULL) << 40) |
((x & 0x00000000000000ffULL) << 56);
}
#endif
#ifdef __LITTLE_ENDIAN__
#define CC_H2BE32(x) CC_BSWAP32(x)
#define CC_H2LE32(x) (x)
#define CC_H2BE64(x) CC_BSWAP64(x)
#define CC_H2LE64(x) (x)
#else
#define CC_H2BE32(x) (x)
#define CC_H2LE32(x) CC_BSWAP32(x)
#define CC_H2BE64(x) (x)
#define CC_H2LE64(x) CC_BSWAP64(x)
#endif
#define CC_READ_LE32(ptr) \
( (uint32_t)( \
((uint32_t)((const uint8_t *)(ptr))[0]) | \
(((uint32_t)((const uint8_t *)(ptr))[1]) << 8) | \
(((uint32_t)((const uint8_t *)(ptr))[2]) << 16) | \
(((uint32_t)((const uint8_t *)(ptr))[3]) << 24)))
#define CC_WRITE_LE32(ptr, x) \
do { \
((uint8_t *)(ptr))[0] = (uint8_t)( (x) & 0xFF); \
((uint8_t *)(ptr))[1] = (uint8_t)(((x) >> 8) & 0xFF); \
((uint8_t *)(ptr))[2] = (uint8_t)(((x) >> 16) & 0xFF); \
((uint8_t *)(ptr))[3] = (uint8_t)(((x) >> 24) & 0xFF); \
} while(0)
#define CC_WRITE_LE64(ptr, x) \
do { \
((uint8_t *)(ptr))[0] = (uint8_t)( (x) & 0xFF); \
((uint8_t *)(ptr))[1] = (uint8_t)(((x) >> 8) & 0xFF); \
((uint8_t *)(ptr))[2] = (uint8_t)(((x) >> 16) & 0xFF); \
((uint8_t *)(ptr))[3] = (uint8_t)(((x) >> 24) & 0xFF); \
((uint8_t *)(ptr))[4] = (uint8_t)(((x) >> 32) & 0xFF); \
((uint8_t *)(ptr))[5] = (uint8_t)(((x) >> 40) & 0xFF); \
((uint8_t *)(ptr))[6] = (uint8_t)(((x) >> 48) & 0xFF); \
((uint8_t *)(ptr))[7] = (uint8_t)(((x) >> 56) & 0xFF); \
} while(0)
/* extract a byte portably */
#ifdef _MSC_VER
#define cc_byte(x, n) ((unsigned char)((x) >> (8 * (n))))
#else
#define cc_byte(x, n) (((x) >> (8 * (n))) & 255)
#endif
/* Count leading zeros (for nonzero inputs) */
/*
* On i386 and x86_64, we know clang and GCC will generate BSR for
* __builtin_clzl. This instruction IS NOT constant time on all micro-
* architectures, but it *is* constant time on all micro-architectures that
* have been used by Apple, and we expect that to continue to be the case.
*
* When building for x86_64h with clang, this produces LZCNT, which is exactly
* what we want.
*
* On arm and arm64, we know that clang and GCC generate the constant-time CLZ
* instruction from __builtin_clzl( ).
*/
#if defined(_WIN32)
/* We use the Windows implementations below. */
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__)
/* We use a thought-to-be-good version of __builtin_clz. */
#elif defined __GNUC__
#warning Using __builtin_clz() on an unknown architecture; it may not be constant-time.
/* If you find yourself seeing this warning, file a radar for someone to
* check whether or not __builtin_clz() generates a constant-time
* implementation on the architecture you are targeting. If it does, append
* the name of that architecture to the list of "safe" architectures above. */
#endif
CC_INLINE CC_CONST unsigned cc_clz32_fallback(uint32_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from LSB to MSB
for (int i = 0; i < 32; i++) {
bit = (data >> i) & 1;
// If the bit is 0, update the "leading bits are zero" counter "b".
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
CC_INLINE CC_CONST unsigned cc_clz64_fallback(uint64_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from LSB to MSB
for (int i = 0; i < 64; i++) {
bit = (data >> i) & 1;
// If the bit is 0, update the "leading bits are zero" counter.
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
CC_INLINE CC_CONST unsigned cc_ctz32_fallback(uint32_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from MSB to LSB
for (int i = 31; i >= 0; i--) {
bit = (data >> i) & 1;
// If the bit is 0, update the "trailing zero bits" counter.
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
CC_INLINE CC_CONST unsigned cc_ctz64_fallback(uint64_t data)
{
unsigned int b = 0;
unsigned int bit = 0;
// Work from MSB to LSB
for (int i = 63; i >= 0; i--) {
bit = (data >> i) & 1;
// If the bit is 0, update the "trailing zero bits" counter.
b += (1 - bit);
/* If the bit is 0, (bit - 1) is 0xffff... therefore b is retained.
* If the bit is 1, (bit - 1) is 0 therefore b is set to 0.
*/
b &= (bit - 1);
}
return b;
}
/*!
@function cc_clz32
@abstract Count leading zeros of a nonzero 32-bit value
@param data A nonzero 32-bit value
@result Count of leading zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_clz32(uint32_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_clz32_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
cc_static_assert(sizeof(unsigned) == 4, "clz relies on an unsigned int being 4 bytes");
return (unsigned)__builtin_clz(data);
#else
return cc_clz32_fallback(data);
#endif
}
/*!
@function cc_clz64
@abstract Count leading zeros of a nonzero 64-bit value
@param data A nonzero 64-bit value
@result Count of leading zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_clz64(uint64_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_clz64_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
return (unsigned)__builtin_clzll(data);
#else
return cc_clz64_fallback(data);
#endif
}
/*!
@function cc_ctz32
@abstract Count trailing zeros of a nonzero 32-bit value
@param data A nonzero 32-bit value
@result Count of trailing zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_ctz32(uint32_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_ctz32_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
cc_static_assert(sizeof(unsigned) == 4, "ctz relies on an unsigned int being 4 bytes");
return (unsigned)__builtin_ctz(data);
#else
return cc_ctz32_fallback(data);
#endif
}
/*!
@function cc_ctz64
@abstract Count trailing zeros of a nonzero 64-bit value
@param data A nonzero 64-bit value
@result Count of trailing zeros of @p data
@discussion @p data is assumed to be nonzero.
*/
CC_INLINE CC_CONST unsigned cc_ctz64(uint64_t data) {
cc_assert(data != 0);
#if defined(_WIN32)
return cc_ctz64_fallback(data);
#elif defined(__x86_64__) || defined(__i386__) || defined(__arm64__) || defined(__arm__) || defined(__GNUC__)
return (unsigned)__builtin_ctzll(data);
#else
return cc_ctz64_fallback(data);
#endif
}
/*!
@function cc_ffs32_fallback
@abstract Find first bit set in a 32-bit value
@param data A 32-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs32_fallback(int32_t data)
{
unsigned b = 0;
unsigned bit = 0;
unsigned seen = 0;
// Work from LSB to MSB
for (int i = 0; i < 32; i++) {
bit = ((uint32_t)data >> i) & 1;
// Track whether we've seen a 1 bit.
seen |= bit;
// If the bit is 0 and we haven't seen a 1 yet, increment b.
b += (1 - bit) & (seen - 1);
}
// If we saw a 1, return b + 1, else 0.
return (~(seen - 1)) & (b + 1);
}
/*!
@function cc_ffs64_fallback
@abstract Find first bit set in a 64-bit value
@param data A 64-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs64_fallback(int64_t data)
{
unsigned b = 0;
unsigned bit = 0;
unsigned seen = 0;
// Work from LSB to MSB
for (int i = 0; i < 64; i++) {
bit = ((uint64_t)data >> i) & 1;
// Track whether we've seen a 1 bit.
seen |= bit;
// If the bit is 0 and we haven't seen a 1 yet, increment b.
b += (1 - bit) & (seen - 1);
}
// If we saw a 1, return b + 1, else 0.
return (~(seen - 1)) & (b + 1);
}
/*!
@function cc_ffs32
@abstract Find first bit set in a 32-bit value
@param data A 32-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs32(int32_t data)
{
cc_static_assert(sizeof(int) == 4, "ffs relies on an int being 4 bytes");
#ifdef _WIN32
return cc_ffs32_fallback(data);
#else
return (unsigned)__builtin_ffs(data);
#endif
}
/*!
@function cc_ffs64
@abstract Find first bit set in a 64-bit value
@param data A 64-bit value
@result One plus the index of the least-significant bit set in @p data or, if @p data is zero, zero
*/
CC_INLINE CC_CONST unsigned cc_ffs64(int64_t data)
{
#ifdef _WIN32
return cc_ffs64_fallback(data);
#else
return (unsigned)__builtin_ffsll(data);
#endif
}
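// Illustrative values (not part of the original header) for the bit-scan
// helpers above; the function name is hypothetical.
CC_UNUSED static bool cc_bitscan_example(void)
{
    // clz/ctz require nonzero inputs; ffs returns 0 only for a zero input.
    return cc_clz32(1) == 31 && cc_ctz32(8) == 3 &&
           cc_ffs32(8) == 4 && cc_ffs32(0) == 0;
}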
#define cc_add_overflow __builtin_add_overflow
#define cc_mul_overflow __builtin_mul_overflow
/* HEAVISIDE_STEP (shifted by one)
 function f(x): x -> 0, when x = 0
 x -> 1, when x > 0
 Can also be seen as a bitwise operation:
 f(x): x -> y
 y[0] = (OR of x[i]) for all i (all bits)
 y[i] = 0 for all i > 0
 Runs in constant time (log2(<bitsize of x>))
 Useful for constant-time checks
*/
#define CC_HEAVISIDE_STEP(r, s) { \
const uint64_t _s = (uint64_t)s; \
const uint64_t _t = (_s & 0xffffffff) | (_s >> 32); \
r = (__typeof__(r))((0xffffffff + _t) >> 32); \
}
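// Illustrative usage sketch (not part of the original header): map any nonzero
// value to 1 and zero to 0 without branching. The function name is hypothetical.
CC_UNUSED static unsigned cc_heaviside_example(uint64_t x)
{
    unsigned r;
    CC_HEAVISIDE_STEP(r, x); // r == 0 when x == 0, otherwise r == 1
    return r;
}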
/* Return 1 if x mod 4 is 1, 2, or 3; 0 otherwise */
#define CC_CARRY_2BITS(x) (((x>>1) | x) & 0x1)
#define CC_CARRY_3BITS(x) (((x>>2) | (x>>1) | x) & 0x1)
#define cc_ceiling(a,b) (((a)+((b)-1))/(b))
#define CC_BITLEN_TO_BYTELEN(x) cc_ceiling((x), 8)
/*!
@brief cc_muxp(s, a, b) is equivalent to z = s ? a : b, but it executes in constant time
@param a input pointer
@param b input pointer
 @param s The selection parameter; s must be 0 or 1. If s is 1, a is returned. If s is 0, b is returned. Otherwise, the output is undefined.
 @return Returns a if s is 1, and b if s is 0
*/
void *cc_muxp(int s, const void *a, const void *b);
/*!
@brief CC_MUXU(r, s, a, b) is equivalent to r = s ? a : b, but executes in constant time
@param a Input a
@param b Input b
@param s Selection parameter s. Must be 0 or 1.
@param r Output, set to a if s=1, or b if s=0.
*/
#define CC_MUXU(r, s, a, b) \
{ \
__typeof__(r) _cond = (__typeof__(r))((s)-1); \
r = (~_cond & (a)) | (_cond & (b)); \
}
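// Illustrative usage sketch (not part of the original header): constant-time
// selection between two words; s must be exactly 0 or 1. The function name is
// hypothetical.
CC_UNUSED static uint32_t cc_muxu_example(int s, uint32_t a, uint32_t b)
{
    uint32_t r;
    CC_MUXU(r, s, a, b); // r == a when s == 1, r == b when s == 0
    return r;
}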
#define CC_PROVIDES_ABORT (!(CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKITROM))
/*!
@function cc_abort
@abstract Abort execution unconditionally
*/
CC_NORETURN
void cc_abort(const char *msg);
/*!
@function cc_try_abort
@abstract Abort execution iff the platform provides a function like @p abort() or @p panic()
@discussion If the platform does not provide a means to abort execution, this function does nothing; therefore, callers should return an error code after calling this function.
*/
#if CC_PROVIDES_ABORT
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-noreturn"
CC_INLINE
void cc_try_abort(const char *msg)
{
cc_abort(msg);
}
#pragma clang diagnostic pop
#else
CC_INLINE
void cc_try_abort(CC_UNUSED const char *msg)
{
}
#endif
#if __has_builtin(__builtin_expect)
#define CC_UNLIKELY(cond) __builtin_expect(cond, 0)
#else
#define CC_UNLIKELY(cond) cond
#endif
CC_INLINE
void cc_try_abort_if(bool condition, const char *msg)
{
if (CC_UNLIKELY(condition)) {
cc_try_abort(msg);
}
}
/*
Unfortunately, since we export this symbol, this declaration needs
to be in a public header to satisfy TAPI.
See fipspost_trace_priv.h for more details.
*/
extern const void *fipspost_trace_vtable;
#endif /* _CORECRYPTO_CC_PRIV_H_ */

View File

@ -0,0 +1,90 @@
/* Copyright (c) (2012,2014,2015,2016,2017,2018,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef CORECRYPTO_CC_RUNTIME_CONFIG_H_
#define CORECRYPTO_CC_RUNTIME_CONFIG_H_
#include <corecrypto/cc_config.h>
#if defined(__x86_64__) || defined(__i386__)
#if CC_KERNEL
#include <i386/cpuid.h>
#define CC_HAS_RDRAND() ((cpuid_features() & CPUID_FEATURE_RDRAND) != 0)
#define CC_HAS_AESNI() ((cpuid_features() & CPUID_FEATURE_AES) != 0)
#define CC_HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
#define CC_HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
#define CC_HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
#define CC_HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
#define CC_HAS_BMI2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_BMI2) != 0)
#define CC_HAS_ADX() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_ADX) != 0)
#elif CC_DARWIN && CC_INTERNAL_SDK
#include <System/i386/cpu_capabilities.h>
#define CC_HAS_RDRAND() (_get_cpu_capabilities() & kHasRDRAND)
#define CC_HAS_AESNI() (_get_cpu_capabilities() & kHasAES)
#define CC_HAS_SupplementalSSE3() (_get_cpu_capabilities() & kHasSupplementalSSE3)
#define CC_HAS_AVX1() (_get_cpu_capabilities() & kHasAVX1_0)
#define CC_HAS_AVX2() (_get_cpu_capabilities() & kHasAVX2_0)
#define CC_HAS_AVX512_AND_IN_KERNEL() 0
#define CC_HAS_BMI2() (_get_cpu_capabilities() & kHasBMI2)
#define CC_HAS_ADX() (_get_cpu_capabilities() & kHasADX)
#else
#define CC_HAS_AESNI() __builtin_cpu_supports("aes")
#define CC_HAS_SupplementalSSE3() __builtin_cpu_supports("ssse3")
#define CC_HAS_AVX1() __builtin_cpu_supports("avx")
#define CC_HAS_AVX2() __builtin_cpu_supports("avx2")
#define CC_HAS_AVX512_AND_IN_KERNEL() 0
#define CC_HAS_BMI2() __builtin_cpu_supports("bmi2")
#if CC_LINUX || !CC_INTERNAL_SDK
#include <cpuid.h>
#include <stdbool.h>
CC_INLINE bool _cpu_supports_rdrand()
{
unsigned int eax, ebx, ecx, edx;
__cpuid(1, eax, ebx, ecx, edx);
return ecx & bit_RDRND;
}
CC_INLINE bool _cpu_supports_adx()
{
unsigned int eax, ebx, ecx, edx;
__cpuid_count(7, 0, eax, ebx, ecx, edx);
return ebx & bit_ADX;
}
#define CC_HAS_RDRAND() _cpu_supports_rdrand()
#define CC_HAS_ADX() _cpu_supports_adx()
#else
#define CC_HAS_RDRAND() 0
#define CC_HAS_ADX() 0
#endif
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#if defined(__arm64__)
#if CC_DARWIN && CC_INTERNAL_SDK
#include <System/arm/cpu_capabilities.h>
#define CC_HAS_SHA512() (_get_cpu_capabilities() & kHasARMv82SHA512)
#define CC_HAS_SHA3() (_get_cpu_capabilities() & kHasARMv82SHA3)
#else
#define CC_HAS_SHA512() (0)
#define CC_HAS_SHA3() (0)
#endif
#endif // defined(__arm64__)
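// Illustrative usage sketch (not part of the original header): these
// predicates are checked at run time to select an optimized implementation;
// the function names below are hypothetical.
//
//   #if defined(__x86_64__) || defined(__i386__)
//   if (CC_HAS_AESNI()) { aes_with_aesni(); } else { aes_generic(); }
//   #endif
//   #if defined(__arm64__)
//   if (CC_HAS_SHA512()) { sha512_with_arm_intrinsics(); } else { sha512_generic(); }
//   #endif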
#endif /* CORECRYPTO_CC_RUNTIME_CONFIG_H_ */

View File

@ -0,0 +1,174 @@
/* Copyright (c) (2013,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef corecrypto_arm_aes_compatability_h
#define corecrypto_arm_aes_compatability_h
// #include <Availability.h>
#include <sys/cdefs.h>
#if defined(__clang__) && ((defined(__apple_build_version__) && __apple_build_version__ > 5010000))
#define __USES_V_CRYPTO_INTRINSICS 1
#else
#define __USES_V_CRYPTO_INTRINSICS 0
#endif
// AES INSTRUCTIONS
// aese.16b v0, v1
// aesd.16b v0, v1
// aesmc.16b v0, v1
// aesimc.16b v0, v1
// SHA1 INTRINSICS
// sha1su0.4s v0, v1, v2
// sha1su1.4s v0, v1
// sha1c.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1m.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1p.4s v0, v1, v2 // or q0, s1, v2.4s
// sha1h.4s v0, v1 // or s0, s1
// SHA256 INTRINSICS
// sha256su0.4s v0, v1
// sha256su1.4s v0, v1, v2
// sha256h.4s v0, v1, v2 // or q0, q1, v2.4s
// sha256h2.4s v0, v1, v2 // or q0, q1, v2.4s
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro AESE
aese.16b v$0, v$1
.endm
.macro AESD
aesd.16b v$0, v$1
.endm
.macro AESMC
aesmc.16b v$0, v$1
.endm
.macro AESIMC
aesimc.16b v$0, v$1
.endm
#else
.macro AESE
aese q$0, q$1
.endm
.macro AESD
aesd q$0, q$1
.endm
.macro AESMC
aesmc q$0, q$1
.endm
.macro AESIMC
aesimc q$0, q$1
.endm
#endif
#if __USES_V_CRYPTO_INTRINSICS == 1
.macro SHA1SU0
sha1su0 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA1SU1
sha1su1 v$0.4s, v$1.4s
.endm
.macro SHA1C
sha1c q$0, s$1, v$2.4s
.endm
.macro SHA1M
sha1m q$0, s$1, v$2.4s
.endm
.macro SHA1P
sha1p q$0, s$1, v$2.4s
.endm
.macro SHA1H
sha1h s$0, s$1
.endm
.macro SHA256SU0
sha256su0 v$0.4s, v$1.4s
.endm
.macro SHA256SU1
sha256su1 v$0.4s, v$1.4s, v$2.4s
.endm
.macro SHA256H
sha256h q$0, q$1, v$2.4s
.endm
.macro SHA256H2
sha256h2 q$0, q$1, v$2.4s
.endm
#else
.macro SHA1SU0
sha1su0 q$0, q$1, q$2
.endm
.macro SHA1SU1
sha1su1 q$0, q$1
.endm
.macro SHA1C
sha1c q$0, q$1, q$2
.endm
.macro SHA1M
sha1m q$0, q$1, q$2
.endm
.macro SHA1P
sha1p q$0, q$1, q$2
.endm
.macro SHA1H
sha1h q$0, q$1
.endm
.macro SHA256SU0
sha256su0 q$0, q$1
.endm
.macro SHA256SU1
sha256su1 q$0, q$1, q$2
.endm
.macro SHA256H
sha256h q$0, q$1, q$2
.endm
.macro SHA256H2
sha256h2 q$0, q$1, q$2
.endm
#endif
#endif /*corecrypto_arm_aes_compatability_h*/

View File

@ -0,0 +1,46 @@
/* Copyright (c) (2011,2015,2016,2018-2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_
#define _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_
/*
* This file defines commonly used macros in handwritten assembly
* for making functions BTI and PAC compatible.
*/
#ifndef __arm64e__
#define __arm64e__ 0
#endif
.macro SIGN_LR
#if __arm64e__
pacibsp
#endif
.endmacro
.macro AUTH_LR_AND_RET
#if __arm64e__
retab
#else
ret
#endif
.endmacro
.macro BRANCH_TARGET_CALL
#if __arm64e__
hint #34 /* bti c */
#endif
.endmacro
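/*
 * Illustrative usage (not part of the original header): a handwritten arm64
 * routine would typically place BRANCH_TARGET_CALL as its first instruction,
 * SIGN_LR before the link register is spilled, and AUTH_LR_AND_RET in place of
 * a plain ret. The symbol name below is hypothetical.
 *
 *     .globl _example_fn
 *     _example_fn:
 *         BRANCH_TARGET_CALL
 *         SIGN_LR
 *         // ... function body ...
 *         AUTH_LR_AND_RET
 */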
#endif /* _CORECRYPTO_CCARM_PAC_BTI_MACROS_H_ */

View File

@ -0,0 +1,596 @@
/* Copyright (c) (2014,2015,2016,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include "../corecrypto_test/include/testmore.h"
#include "testbyteBuffer.h"
#include <stdbool.h>
#include <limits.h>
#define CC_SECURITY_TEST
#if (CC == 0)
entryPoint(cc_tests,"cc")
#else
#ifdef CC_SECURITY_TEST
#include <corecrypto/ccrng_test.h>
#include "cccycles.h"
#include "ccstats.h"
#include "ccconstanttime.h"
#endif
// Disable the static analyzer for the code below since we do voluntary access to
// uninitialized memory area in stack
#ifdef __clang_analyzer__
int stack_clear_test(size_t size);
#endif
#ifndef __clang_analyzer__
#if defined(__has_feature) && __has_feature(address_sanitizer)
#define CC_NO_SANITIZE __attribute__((no_sanitize_address))
#else
#define CC_NO_SANITIZE
#endif // __has_feature
#define STACK_MAGIC 0xC0DEBA5E
CC_NO_SANITIZE static void
stack_dirty(size_t size)
{
volatile uint32_t array[size];
for (size_t i=0;i<size;i++)
{
array[i]=STACK_MAGIC;
}
}
CC_NO_SANITIZE static void
stack_clear(size_t size)
{
uint32_t array[size];
cc_clear(sizeof(array),array);
}
CC_NO_SANITIZE static int
stack_test(size_t size)
{
volatile uint32_t array[size];
for (size_t i=0;i<size;i++)
{
if (array[i]==STACK_MAGIC)
{
return 1; //error stack was not cleared.
}
}
return 0;
}
CC_NO_SANITIZE static int
stack_clear_test(size_t size)
{
stack_dirty(size);
stack_clear(size);
return stack_test(size);
}
#endif /* __clang_analyzer__ */
// Static analyzer re-enabled.
#define CLZ_RANDOM_TESTS 10000
static void
clz_tests(void) {
int i;
uint64_t r64;
uint32_t r32;
struct ccrng_state *rng = global_test_rng;
is(cc_clz32_fallback(2863311530), cc_clz32(2863311530), "clz32 1010... pattern");
is(cc_clz64_fallback(12297829382473034410U), cc_clz64(12297829382473034410U), "clz64 1010... pattern");
is(cc_clz32_fallback(1431655765), cc_clz32(1431655765), "clz32 0101... pattern");
is(cc_clz64_fallback(6148914691236517205U), cc_clz64(6148914691236517205U), "clz64 0101... pattern");
for (i = 0; i < 32; i++) {
is(cc_clz32_fallback(1U << i), cc_clz32(1U << i), "clz32");
is(cc_clz32_fallback((1U << i) + 1), cc_clz32((1U << i) + 1), "clz32 + 1");
is(cc_clz32_fallback((1U << i) + (1U << 16)), cc_clz32((1U << i) + (1U << 16)), "clz32 + 1 << 16");
}
for (i = 0; i < 64; i++) {
is(cc_clz64_fallback(1ULL << i), cc_clz64(1ULL << i), "clz64");
is(cc_clz64_fallback((1ULL << i) + 1), cc_clz64((1ULL << i) + 1), "clz64 + 1");
is(cc_clz64_fallback((1ULL << i) + UINT_MAX + 1), cc_clz64((1ULL << i) + UINT_MAX + 1), "clz64 + 1 << 32");
}
for (i = 0; i < CLZ_RANDOM_TESTS; i++)
{
ccrng_generate(rng, sizeof(r64), &r64);
is(cc_clz64_fallback(r64), cc_clz64(r64), "clz64 random");
r32 = r64 >> 32;
is(cc_clz32_fallback(r32), cc_clz32(r32), "clz32 random");
}
}
#define CTZ_RANDOM_TESTS 10000
static void
ctz_tests(void) {
int i;
uint64_t r64;
uint32_t r32;
struct ccrng_state *rng = global_test_rng;
is(cc_ctz32_fallback(2863311530), cc_ctz32(2863311530), "ctz32 1010... pattern");
is(cc_ctz64_fallback(12297829382473034410U), cc_ctz64(12297829382473034410U), "ctz64 1010... pattern");
is(cc_ctz32_fallback(1431655765), cc_ctz32(1431655765), "ctz32 0101... pattern");
is(cc_ctz64_fallback(6148914691236517205U), cc_ctz64(6148914691236517205U), "ctz64 0101... pattern");
for (i = 0; i < 32; i++) {
is(cc_ctz32_fallback(1U << i), cc_ctz32(1U << i), "ctz32");
is(cc_ctz32_fallback((1U << i) + 1), cc_ctz32((1U << i) + 1), "ctz32 + 1");
is(cc_ctz32_fallback((1U << i) + (1U << 16)), cc_ctz32((1U << i) + (1U << 16)), "ctz32 + 1 << 16");
}
for (i = 0; i < 64; i++) {
is(cc_ctz64_fallback(1ULL << i), cc_ctz64(1ULL << i), "ctz64");
is(cc_ctz64_fallback((1ULL << i) + 1), cc_ctz64((1ULL << i) + 1), "ctz64 + 1");
is(cc_ctz64_fallback((1ULL << i) + UINT_MAX + 1), cc_ctz64((1ULL << i) + UINT_MAX + 1), "ctz64 + 1 << 32");
}
for (i = 0; i < CTZ_RANDOM_TESTS; i++)
{
ccrng_generate(rng, sizeof(r64), &r64);
is(cc_ctz64_fallback(r64), cc_ctz64(r64), "ctz64 random");
r32 = r64 >> 32;
is(cc_ctz32_fallback(r32), cc_ctz32(r32), "ctz32 random");
}
}
#define FFS_RANDOM_TESTS 10000
static void
ffs_tests(void) {
int i;
int64_t r64;
int32_t r32;
struct ccrng_state *rng = global_test_rng;
is(cc_ffs32_fallback(0), cc_ffs32(0), "ffs32 zero");
is(cc_ffs64_fallback(0), cc_ffs64(0), "ffs64 zero");
is(cc_ffs32_fallback((int32_t)2863311530), cc_ffs32((int32_t)2863311530), "ffs32 1010... pattern");
is(cc_ffs64_fallback((int64_t)12297829382473034410U), cc_ffs64((int64_t)12297829382473034410U), "ffs64 1010... pattern");
is(cc_ffs32_fallback(1431655765), cc_ffs32(1431655765), "ffs32 0101... pattern");
is(cc_ffs64_fallback(6148914691236517205), cc_ffs64(6148914691236517205), "ffs64 0101... pattern");
for (i = 0; i < 32; i++) {
is(cc_ffs32_fallback(1 << i), cc_ffs32(1 << i), "ffs32");
is(cc_ffs32_fallback((1 << i) + 1), cc_ffs32((1 << i) + 1), "ffs32 + 1");
is(cc_ffs32_fallback((1 << i) + (1 << 16)), cc_ffs32((1 << i) + (1 << 16)), "ffs32 + 1 << 16");
}
for (i = 0; i < 64; i++) {
is(cc_ffs64_fallback(1LL << i), cc_ffs64(1LL << i), "ffs64");
is(cc_ffs64_fallback((1LL << i) + 1), cc_ffs64((1LL << i) + 1), "ffs64 + 1");
is(cc_ffs64_fallback((1LL << i) + UINT_MAX + 1), cc_ffs64((1LL << i) + UINT_MAX + 1), "ffs64 + 1 << 32");
}
for (i = 0; i < FFS_RANDOM_TESTS; i++) {
ccrng_generate(rng, sizeof(r64), &r64);
is(cc_ffs64_fallback(r64), cc_ffs64(r64), "ffs64 random");
r32 = r64 >> 32;
is(cc_ffs32_fallback(r32), cc_ffs32(r32), "ffs32 random");
}
}
static void
Rotate_Tests(void) {
int c=1;
uint32_t result32=0xaaaaaaaa;
uint64_t result64=0xaaaaaaaaaaaaaaaa;
/* The first argument is NOT a variable on purpose */
is(result32, CC_ROL(0x55555555, c), "CC_ROL 1");
is(result32, CC_ROLc(0x55555555, 1), "CC_ROLc 1");
is(result64, CC_ROL64(0x5555555555555555, c), "CC_ROL64 1");
is(result64, CC_ROL64c(0x5555555555555555, 1), "CC_ROL64c 1");
is(result32, CC_ROR(0x55555555, c), "CC_ROR 1");
is(result32, CC_RORc(0x55555555, 1), "CC_RORc 1");
is(result64, CC_ROR64(0x5555555555555555, c), "CC_ROR64 1");
is(result64, CC_ROR64c(0x5555555555555555, 1), "CC_ROR64c 1");
}
static void
mux_Tests(void) {
uint8_t i8;
uint16_t i16;
uint32_t i32;
uint64_t i64;
CC_MUXU(i8,0,(uint8_t)0xAB,(uint8_t)0xBA);
is(i8,0xBA,"sizeof(uint8_t)!=1");
CC_MUXU(i8,1,(uint8_t)0xBA,(uint8_t)0xAB);
is(i8,0xBA,"sizeof(uint8_t)!=1");
CC_MUXU(i16,0,(uint16_t)0xAB00,(uint16_t)0xBA00);
is(i16,0xBA00,"sizeof(uint8_t)!=1");
CC_MUXU(i16,1,(uint16_t)0xBA00,(uint16_t)0xAB00);
is(i16,0xBA00,"sizeof(uint8_t)!=1");
CC_MUXU(i32,0,(uint32_t)0xAB00BEEF,(uint32_t)0xBA00BEEF);
is(i32,0xBA00BEEF,"sizeof(uint8_t)!=1");
CC_MUXU(i32,1,(uint32_t)0xBA00BEEF,(uint32_t)0xAB00BEEF);
is(i32,0xBA00BEEF,"sizeof(uint8_t)!=1");
CC_MUXU(i64,0,(uint64_t)0xAB00BEEF11223344,(uint64_t)0xBA00BEEF11223344);
is(i64,0xBA00BEEF11223344,"sizeof(uint8_t)!=1");
CC_MUXU(i32,1,(uint64_t)0xBA00BEEF11223344,(uint64_t)0xAB00BEEF11223344);
is(i64,0xBA00BEEF11223344,"sizeof(uint8_t)!=1");
}
static void
HEAVISIDE_STEP_Tests(void)
{
uint8_t i8;
uint16_t i16;
uint32_t i32;
uint64_t i64;
size_t i; // loop index
uint8_t err=0,nb_test=0;
// Sanity check on intended lengths
ok(sizeof(uint8_t) == 1, "sizeof(uint8_t)!=1");
ok(sizeof(uint16_t) == 2, "sizeof(uint16_t)!=2");
ok(sizeof(uint32_t) == 4, "sizeof(uint32_t)!=4");
ok(sizeof(uint64_t) == 8, "sizeof(uint64_t)!=8");
for (i=0;i<8*sizeof(i8);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i8,((uint8_t)1<<i));
if (i8!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i8)");
for (i=0;i<8*sizeof(i16);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i16,((uint16_t)1<<i));
if (i16!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i16)");
for (i=0;i<8*sizeof(i32);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i32,((uint32_t)1<<i));
if (i32!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i32)");
for (i=0;i<8*sizeof(i64);i++)
{
nb_test++;
CC_HEAVISIDE_STEP(i64,((uint64_t)1<<i));
if (i64!=1) err++;
}
ok(err==0,"CC_HEAVISIDE_STEP(i64)");
ok(err + (64+32+16+8)-nb_test==0, "CC HEAVISIDE_STEP test failed");
}
static void
cmp_secure_functionalTests(void) {
#define ARRAY_SIZE 10
// --- Bytes
uint8_t array1[ARRAY_SIZE]={1,2,3,4,5,6,7,8,9,0};
uint8_t array2[ARRAY_SIZE];
memcpy(array2,array1,sizeof(array1));
// Equal
ok(cc_cmp_safe(sizeof(array1), array1,array2)==0, "array1 to array2");
ok(cc_cmp_safe(sizeof(array1), array2,array1)==0, "array2 to array1");
// length is zero
ok(cc_cmp_safe(0, array2,array1)!=0, "Array of size 0");
// Equal but first byte
array1[0]++;
ok(cc_cmp_safe(sizeof(array1), array1,array2)!=0, "first byte");
array1[0]--;
// Equal but last byte
array1[sizeof(array1)-1]++;
ok(cc_cmp_safe(sizeof(array1), array1,array2)!=0, "last byte");
array1[sizeof(array1)-1]--;
// --- cc_units
uint64_t u64_array1[ARRAY_SIZE]={};
for (size_t i=0;i<ARRAY_SIZE;i++) u64_array1[i]=i;
uint64_t u64_array2[ARRAY_SIZE];
uint64_t tmp;
memcpy(u64_array2,u64_array1,sizeof(u64_array1));
// Equal
ok(cc_cmp_safe(sizeof(u64_array1), u64_array1,u64_array2)==0, "array1 to array2");
ok(cc_cmp_safe(sizeof(u64_array1), u64_array2,u64_array1)==0, "array2 to array1");
// length is zero
ok(cc_cmp_safe(0, u64_array2,u64_array1)!=0, "Array of size 0");
// Equal but first byte
((uint8_t *)u64_array1)[0]++;
ok(cc_cmp_safe(sizeof(u64_array1),u64_array1,u64_array2)!=0, "first byte");
((uint8_t *)u64_array1)[0]--;
// Equal but last byte
CC_LOAD64_BE(tmp,&u64_array1[ARRAY_SIZE-1]);
CC_STORE64_BE(tmp^0x80,&u64_array1[ARRAY_SIZE-1]);
ok(cc_cmp_safe(sizeof(u64_array1), u64_array1,u64_array2)!=0, "last byte");
CC_STORE64_BE(tmp,&u64_array1[ARRAY_SIZE-1]);
}
#ifdef CC_SECURITY_TEST
//======================================================================
// Constant time verification parameters
//======================================================================
// Number of warm-up iterations whose timings are not taken into account,
// used to reach a stable performance state.
#define CC_WARMUP 10
// Each sample is the average time over many iterations with identical inputs.
#define CC_TIMING_REPEAT 150
// Number of samples for the statistical analysis;
// typically 100~1000 is a good range.
#define CC_TIMING_SAMPLES 200
// In case of failure, retry several times.
// This reduces false positives due to noise and limited timing accuracy.
// If the implementation is not constant time, its behavior is consistent,
// so retrying does not reduce the detection power.
#define CC_TIMING_RETRIES 10
// Two statistical tools are available: the T-test and the Wilcoxon rank-sum test.
// The T-test assumes that the distributions being compared are normal;
// Wilcoxon measures the offset between distributions.
// Due to potential switches between performance states or occasional
// latencies, Wilcoxon is recommended.
// > Set to 1 to use the T-test instead of Wilcoxon
#define T_TEST 1
// Number of iterations of the full test (tweak to evaluate the chance of false positives)
#define CMP_SECURITY_TEST_ITERATION 1
// Quantile for the repeated timing. Empirical value.
#define CC_TIMING_PERCENTILE 9
//======================================================================
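// For illustration only: the statistical back ends used below,
// T_test_isRejected() and WilcoxonRankSumTest(), live in ccstats.h and are not
// shown in this file. A two-sample Welch t statistic over the two alternating
// timing groups could be computed roughly as sketched here; the 4.5 threshold
// is the conventional TVLA cut-off and is an assumption, not necessarily the
// value corecrypto uses. Assumes measurement_t.timing is a numeric count.
#if 0
#include <math.h>
static int example_t_test_rejects(const measurement_t *samples, size_t n)
{
    double sum[2] = {0, 0}, sumsq[2] = {0, 0};
    size_t cnt[2] = {0, 0};
    for (size_t i = 0; i < n; i++) {
        int g = samples[i].group & 1;            // alternating measurement groups
        double x = (double)samples[i].timing;
        sum[g] += x;
        sumsq[g] += x * x;
        cnt[g]++;
    }
    if (cnt[0] < 2 || cnt[1] < 2) return 0;
    double m0 = sum[0] / cnt[0], m1 = sum[1] / cnt[1];
    double v0 = (sumsq[0] - cnt[0] * m0 * m0) / (cnt[0] - 1); // unbiased sample variances
    double v1 = (sumsq[1] - cnt[1] * m1 * m1) / (cnt[1] - 1);
    double t = (m0 - m1) / sqrt(v0 / cnt[0] + v1 / cnt[1]);   // Welch's t statistic
    return fabs(t) > 4.5; // reject "same timing distribution" => possible leak
}
#endif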
static const int verbose=1;
#define TEST_LAST_BYTE 1
#define TEST_FIRST_BYTE 2
#define TEST_RANDOM 3
#define TEST_EQUAL 4
static int
cmp_secure_timeconstantTests(size_t length, struct ccrng_state *rng, uint32_t test_id) {
// Random for messages
uint8_t array1[length];
uint8_t array2[length];
int failure_cnt=0;
int early_abort=1;
uint32_t j,sample_counter;
bool retry=true;
if (length<=0) {goto errOut;}
j=0;
while(retry)
{
sample_counter=0; // Index of current sample
measurement_t timing_sample[2*CC_TIMING_SAMPLES];
for (size_t i=0;i<2*CC_TIMING_SAMPLES+(CC_WARMUP/CC_TIMING_REPEAT);i++)
{
ccrng_generate(rng,length,array1);
volatile int cmp_result;
if ((i&1) == 0)
{
// -------------------------
// Random
// -------------------------
switch(test_id) {
// All equal, except last byte
case TEST_LAST_BYTE:
memcpy(array2,array1,length);
array2[length-1]^=1;
break;
// All equal, except first byte
case TEST_FIRST_BYTE:
memcpy(array2,array1,length);
array2[0]^=1;
break;
// Random
case TEST_RANDOM:
ccrng_generate(rng,length,array2);
break;
// All equal
case TEST_EQUAL:
memcpy(array2,array1,length);
break;
default:
return 0; // failure
}
}
else
{
// -------------------------
// Equal
// -------------------------
memcpy(array2,array1,length);
}
#if 1
// Actual function to test
TIMING_WITH_QUANTILE(timing_sample[sample_counter].timing,
CC_TIMING_REPEAT,
CC_TIMING_PERCENTILE,
cmp_result=cc_cmp_safe(length, array1, array2),errOut);
#else
// Reference which can be expected to fail
TIMING_WITH_QUANTILE(timing_sample[sample_counter].timing,
CC_TIMING_REPEAT,
CC_TIMING_PERCENTILE,
cmp_result=memcmp(array1, array2,length),errOut);
#endif
timing_sample[sample_counter].group=sample_counter&1;
#if CC_WARMUP
if (i>=CC_WARMUP/CC_TIMING_REPEAT)
#endif
{
sample_counter++;
}
}
#if CCN_OSX
if (verbose>1) {
char file_name[64];
snprintf(file_name,sizeof(file_name),"corecrypto_test_cc_cmp_timings_%.2zu.csv",length);
export_measurement_to_file(file_name,timing_sample,sample_counter);
}
#endif
// Process results
#if T_TEST
// T test
int status=T_test_isRejected(timing_sample,sample_counter);
#else
// Wilcoxon Rank-Sum Test
int status=WilcoxonRankSumTest(timing_sample,sample_counter);
#endif
if (status!=0)
{
j++; // retry counter
if (j>=CC_TIMING_RETRIES)
{
diag("Constant timing FAILED for len %d after %d attempts",length,j);
//ok_or_fail((status==0),"Decrypt+padding constant timing");
failure_cnt++;
break;
}
}
else
{
if ((verbose>1) && (j>0)) diag("Constant timing ok for len %zu after %u attempts (of %d)",length,j+1,CC_TIMING_RETRIES);
break;
}
} // retry
early_abort=0;
errOut:
if (failure_cnt || early_abort)
{
return 0;
}
return 1;
}
#define CMP_SECURITY_TEST_MAX_LENGTH 2048
static void
memcmp_secure_securityTests(void) {
// Random for messages
struct ccrng_state *rng = global_test_rng;
for (size_t i=0;i<CMP_SECURITY_TEST_ITERATION;i++)
{
size_t r;
ccrng_generate(rng,sizeof(r),&r);
r=(r%CMP_SECURITY_TEST_MAX_LENGTH)+1;
ok(cmp_secure_timeconstantTests(r,rng,TEST_FIRST_BYTE), "Time constant check, first byte difference");
ok(cmp_secure_timeconstantTests(r,rng,TEST_LAST_BYTE), "Time constant check, last byte difference");
ok(cmp_secure_timeconstantTests(r,rng,TEST_RANDOM), "Time constant check, random");
ok(cmp_secure_timeconstantTests(r,rng,TEST_EQUAL), "Time constant check of equal input - if it fails, it's a test issue");
}
}
#endif // CC_SECURITY_TEST
#ifdef CC_SECURITY_TEST
#define kPlan_ccSecurityTestNb 5
#else
#define kPlan_ccSecurityTestNb 0
#endif
int cc_tests(TM_UNUSED int argc, TM_UNUSED char *const *argv)
{
int num_tests = 36 + kPlan_ccSecurityTestNb;
num_tests += 292 + 2 * CLZ_RANDOM_TESTS; // clz_tests
num_tests += 292 + 2 * CTZ_RANDOM_TESTS; // ctz_tests
num_tests += 294 + 2 * FFS_RANDOM_TESTS; // ffs_tests
plan_tests(num_tests);
clz_tests();
ctz_tests();
ffs_tests();
//For Windows port, many unsigned longs have been replaced with size_t.
//This test makes sure corecrypto is agnostic to the change.
//This test can be removed later on.
#if defined(_WIN64) && defined(_WIN32)
ok(sizeof(size_t)!=sizeof(unsigned long),
#else
ok(sizeof(size_t)==sizeof(unsigned long),
#endif
"Historically, corecrypto assumes size_t and long have the same size. Fon Win64, that is not the case");
if(verbose) diag("Stack cleanup");
ok(stack_clear_test(100)==0, "Stack clearing");
if(verbose) diag("mux test");
mux_Tests();
if(verbose) diag("HEAVISIDE_STEP test");
HEAVISIDE_STEP_Tests();
if(verbose) diag("Rotate test");
Rotate_Tests();
if(verbose) diag("Secure comparison test");
cmp_secure_functionalTests();
#ifdef CC_SECURITY_TEST
if(verbose) diag("Secure comparison security test");
memcmp_secure_securityTests();
#endif // CC_SECURITY_TEST
// Silence code coverage
const char *label = "corecrypto";
const uint8_t *buffer = (const uint8_t *)label;
cc_print("label", strlen(label), buffer);
return 0;
}
#endif //CC

54
cc/src/cc_abort.c Normal file
View File

@ -0,0 +1,54 @@
/* Copyright (c) (2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
// cc_abort() is implemented to comply with FIPS 140-2, which requires aborting
// when the DRBG produces two equal consecutive blocks.
#if !CC_PROVIDES_ABORT
#error "This environment does not provide an abort()/panic()-like function"
#elif CC_KERNEL
#include <kern/debug.h>
void cc_abort(const char * msg)
{
panic("%s", msg);
}
#elif CC_USE_L4
#include <sys/panic.h>
#include <stdarg.h>
void cc_abort(const char * msg)
{
sys_panic(msg);
}
#elif CC_RTKIT
#include <RTK_platform.h>
void cc_abort(const char * msg)
{
RTK_abort("%s", msg);
}
#else
#include <stdlib.h>
void cc_abort(const char * msg CC_UNUSED)
{
abort();
}
#endif

18
cc/src/cc_atfork_child.c Normal file
View File

@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>
void cc_atfork_child(void)
{
ccrng_cryptographic_atfork_child();
}

18
cc/src/cc_atfork_parent.c Normal file
View File

@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>
void cc_atfork_parent(void)
{
ccrng_cryptographic_atfork_parent();
}

View File

@ -0,0 +1,18 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include <corecrypto/ccrng_cryptographic.h>
void cc_atfork_prepare(void)
{
ccrng_cryptographic_atfork_prepare();
}

35
cc/src/cc_clear.c Normal file
View File

@ -0,0 +1,35 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc.h>
#include "corecrypto/fipspost_trace.h"
#if ( CC_HAS_MEMSET_S == 1 ) && (defined( __STDC_WANT_LIB_EXT1__ ) && ( __STDC_WANT_LIB_EXT1__ == 1 ) )
void cc_clear(size_t len, void *dst)
{
FIPSPOST_TRACE_EVENT;
memset_s(dst,len,0,len);
}
#elif defined(_WIN32) && !defined(__clang__) // Clang with Microsoft CodeGen doesn't support SecureZeroMemory
#include <windows.h>
void cc_clear(size_t len, void *dst)
{
SecureZeroMemory(dst, len);
}
#else
void cc_clear(size_t len, void *dst)
{
FIPSPOST_TRACE_EVENT;
volatile char *vptr = (volatile char *)dst;
while (len--)
*vptr++ = '\0';
}
#endif
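/*
 * Illustrative caller pattern (not part of this file): cc_clear() is intended
 * for wiping secrets where a plain memset() could be elided as a dead store.
 */
#if 0
{
    uint8_t key[32];
    /* ... derive and use the key ... */
    cc_clear(sizeof(key), key); /* wipe before the buffer goes out of scope */
}
#endif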

26
cc/src/cc_cmp_safe.c Normal file
View File

@ -0,0 +1,26 @@
/* Copyright (c) (2014,2015,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2)
{
size_t i;
const uint8_t *s=(const uint8_t *)ptr1;
const uint8_t *t=(const uint8_t *)ptr2;
uint8_t flag=((num<=0)?1:0); // If 0 return an error
for (i=0;i<num;i++)
{
flag|=(s[i]^t[i]);
}
CC_HEAVISIDE_STEP(flag,flag); // flag=(flag==0)?0:1;
return flag; // 0 iff all bytes were equal, 1 if there is any difference
}
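/*
 * Illustrative caller pattern (not part of this file): authentication-tag
 * comparison, where an early-exit memcmp() would leak how many leading bytes
 * matched. The function name below is hypothetical.
 */
#if 0
static bool tag_matches(size_t tag_len, const uint8_t *computed, const uint8_t *received)
{
    return cc_cmp_safe(tag_len, computed, received) == 0; /* 0 means equal */
}
#endif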

26
cc/src/cc_debug.c Normal file
View File

@ -0,0 +1,26 @@
/* Copyright (c) (2014,2015,2016,2017,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
#include "cc_debug.h"
#include "cc_memory.h"
#if CORECRYPTO_DEBUG
struct ws_dbg g_ws_dbg;
#endif
void cc_print(const char *label, size_t count, const uint8_t *s) {
cc_printf("%s { %zu, ",label, count);
for (size_t ix=0; ix<count ; ix++) {
cc_printf("%.02x", s[ix]);
}
cc_printf(" }\n");
}

35
cc/src/cc_fault_canary.c Normal file
View File

@ -0,0 +1,35 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_fault_canary.h>
#include <corecrypto/cc_fault_canary_internal.h>
const cc_fault_canary_t CCEC_FAULT_CANARY = { 0xce, 0x3c, 0xed, 0x46, 0x6b, 0x11, 0xbf, 0x08, 0x13, 0xa0, 0xd4, 0xbf, 0x89, 0x60, 0xeb, 0x56 };
const cc_fault_canary_t CCRSA_PSS_FAULT_CANARY = { 0xef, 0x49, 0xba, 0x59, 0x22, 0xfe, 0x10, 0xdd, 0x84, 0x4f, 0x24, 0xd6, 0xad, 0xc0, 0xa9, 0x93 };
const cc_fault_canary_t CCRSA_PKCS1_FAULT_CANARY = { 0xea, 0xc5, 0x4a, 0x7c, 0x9f, 0x28, 0xdf, 0x10, 0xb6, 0xe9, 0x3e, 0xb9, 0x1c, 0xd3, 0x3a, 0xc5 };
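// How the scheme is intended to work: when in1 and in2 agree on every byte, all
// of the XOR'd pairs below vanish and fault_canary_out ends up equal to the
// known fault_canary constant; any mismatch (for example one induced by a
// glitched verification) almost certainly perturbs the output away from that
// constant, so the caller can compare fault_canary_out against the expected value.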
void cc_fault_canary_set(cc_fault_canary_t fault_canary_out, const cc_fault_canary_t fault_canary, size_t nbytes, const uint8_t *in1, const uint8_t *in2)
{
// We need to be careful with our XORs.
// The first loop XORs the actual fault canary value
for (size_t ci = 0; ci < CC_FAULT_CANARY_SIZE; ci++) {
size_t bi = ci % nbytes;
fault_canary_out[ci] = in1[bi] ^ in2[bi] ^ fault_canary[ci];
}
// The second loop XORs the existing value in the input fault canary buffer.
for (size_t i = CC_FAULT_CANARY_SIZE; i < nbytes; i++) {
size_t bi = i % nbytes;
size_t ci = i % sizeof(CCEC_FAULT_CANARY);
fault_canary_out[ci] = in1[bi] ^ in2[bi] ^ fault_canary_out[ci];
}
}

27
cc/src/cc_muxp.c Normal file
View File

@ -0,0 +1,27 @@
/* Copyright (c) (2015,2016,2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_priv.h>
/*
Per C99 ISO/IEC 9899:1999 §6.5.8 and 6.5.9 Relational operator:
Each of the operators < , > , <= , >=, ==, != yield 1 if the specified relation is true and 0 if it is false. ... The result type is integer.
Also applies to other revisions of the C standard such as C11.
*/
// Returns z = s ? a : b in constant time, where a and b are pointers. s must be either 0 or 1.
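// Worked example: with s == 1, (uintptr_t)s - 1 == 0, so cond is all ones and
// rc == ia (returns a); with s == 0, s - 1 wraps to the all-ones value, so cond
// is 0 and rc == ib (returns b). No branch depends on s.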
void *cc_muxp(int s, const void *a, const void *b)
{
cc_assert(s==1 || s==0);
uintptr_t ia = (uintptr_t) a;
uintptr_t ib = (uintptr_t) b;
uintptr_t cond =~((uintptr_t)s-(uintptr_t)1);//s?~zero:zero; see above
uintptr_t rc = (cond&ia)|(~cond&ib);
return (void *)rc;
}

39
cc/src/cc_rdrand.c Normal file
View File

@ -0,0 +1,39 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#include <corecrypto/cc_runtime_config.h>
#include "cc_internal.h"
#if defined(__x86_64__)
bool cc_rdrand(uint64_t *rand)
{
bool ok;
if (CC_HAS_RDRAND()) {
asm volatile ("rdrand %0; setc %1" : "=r"(rand), "=qm"(ok) : : "cc");
} else {
*rand = 0;
ok = false;
}
return ok;
}
#else
bool cc_rdrand(uint64_t *rand)
{
*rand = 0;
return false;
}
#endif
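/*
 * Illustrative caller pattern (not part of this file): RDRAND can transiently
 * fail even when supported, so callers commonly retry a bounded number of times.
 */
#if 0
{
    uint64_t r;
    int tries = 10;
    while (tries-- > 0 && !cc_rdrand(&r)) {
        /* retry */
    }
}
#endif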

View File

@ -0,0 +1,25 @@
/* Copyright (c) (2014,2015,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#import "cc_unit.h"
NSString *cc_composeString(NSString *format, ...) {
if (!format) return @"";
NSString *composedString;
va_list args;
va_start(args, format);
composedString = [[[NSString alloc] initWithFormat:format arguments:args] autorelease];
va_end(args);
return composedString;
}

21
cc/xcunit/cc_hex_string.m Normal file
View File

@ -0,0 +1,21 @@
/* Copyright (c) (2010,2014,2015,2016,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#import "cc_unit.h"
NSString *cc_hex_string(size_t len, const unsigned char *s) {
NSMutableString *r = [[NSMutableString alloc] initWithCapacity: 3 + len * 8];
for (size_t ix = 0; ix < len; ++ix) {
[r appendFormat: @"%.02x", s[ix]];
}
[r autorelease];
return r;
}

85
cc/xcunit/cc_unit.h Normal file
View File

@ -0,0 +1,85 @@
/* Copyright (c) (2014,2015,2016,2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#import <XCTest/XCTest.h>
#import <corecrypto/ccrng_test.h>
#import <corecrypto/ccrng_system.h>
NSString *cc_hex_string(size_t len, const unsigned char *s);
NSString *cc_composeString(NSString *format, ...);
#define XCAssertMemEquals(len, a1, a2, description, ...) \
({ \
@try { \
const void *_a1value = (a1); \
const void *_a2value = (a2); \
size_t _lenvalue = (len); \
if (memcmp(_a1value, _a2value, _lenvalue) != 0) {\
NSString *_expression = cc_composeString(description, ##__VA_ARGS__); \
NSString *_a1encoded = cc_hex_string(_lenvalue, _a1value); \
NSString *_a2encoded = cc_hex_string(_lenvalue, _a2value); \
XCTFail(@"%@\n%@\n should be \n%@",_expression, _a1encoded, _a2encoded);\
}\
}\
@catch (NSException *exception) {\
XCTFail(@"An exception caught");\
}\
})
#define XCAssertCharsEquals(len, a1, a2, description, ...) \
({ \
@try { \
const void *_a1value = (a1); \
const void *_a2value = (a2); \
size_t _lenvalue = (len); \
if (memcmp(_a1value, _a2value, _lenvalue) != 0) { \
NSString *_expression = cc_composeString(description, ##__VA_ARGS__); \
NSString *_a1encoded = cc_hex_string(_lenvalue, _a1value); \
NSString *_a2encoded = cc_hex_string(_lenvalue, _a2value); \
XCTFail(@"%@\n%@\n should be \n%@",_expression, _a1encoded, _a2encoded);\
} \
} \
@catch (NSException *exception) {\
XCTFail(@"An exception caught");\
}\
})
// When choosing the input seed, it must have the format "\x00\x01\x02\x03"...
#define XCTestRNG(rngname,input_seed) \
struct ccrng_test_state _test_rng; \
struct ccrng_state* rngname=(struct ccrng_state*)&_test_rng; \
size_t seedlen=sizeof(input_seed)-1; \
uint8_t random_seed[16]; \
uint8_t *seed=(uint8_t *)input_seed; \
if (input_seed==NULL || seedlen<=0) \
{\
seed=random_seed; \
seedlen=sizeof(random_seed); \
struct ccrng_system_state system_rng; \
XCTAssert(ccrng_system_init(&system_rng)==0); \
XCTAssert(ccrng_generate((struct ccrng_state *)&system_rng, seedlen, random_seed)==0); \
ccrng_system_done(&system_rng); \
} else {\
printf("Forced "); \
seed=(uint8_t *)input_seed; \
} \
XCTAssert(ccrng_test_init(&_test_rng, seedlen,seed,"")==0); \
NSString *_seed_encoded = cc_hex_string(seedlen, seed); \
printf("XCTestRNG seed: %s {", [_seed_encoded UTF8String]); \
for (size_t i=0;i<seedlen;i++) printf("\\x%02x",seed[i]); \
printf("}\n"); \
#define XCTestRNG_Done(rng) \
ccrng_test_done((struct ccrng_test_state*)rng); \
rng=NULL;

View File

@ -0,0 +1,74 @@
/* Copyright (c) (2012,2015,2016,2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_H_
#define _CORECRYPTO_FIPSPOST_H_
#include <stdint.h>
#include <corecrypto/cc_config.h>
// Boot-Arg fips_mode Flags
//
// FIPS_MODE_FLAG_FULL is the default value when no other value is set, which
// is the case for all production devices.
//
// When performing tests, if _FORCEFAIL is set to true, then the tests
// intentionally fail and log their failure. The kernelspace and userspace
// flags can be enabled independently.
//
// If it's not desired to panic, supply the _NOPANIC flag with the
// _FORCEFAIL flag.
//
// Additional logging can be enabled by supplying the _VERBOSE flag.
//
// _NOINTEG is used to ignore just the results of the module integrity
// check process, which is very useful when setting breakpoints in the
// kext for diagnostic or auditing purposes.
//
// Supplying _TRACE causes a trace buffer to be accumulated of the instrumented
// functions for only one execution of the POST. As the POST finishes, the
// _TRACE flag is cleared from the fips_mode and no further tracing will occur.
#define FIPS_MODE_FLAG_DEBUG (1 << 0)
#define FIPS_MODE_FLAG_FULL (1 << 1)
#define FIPS_MODE_FLAG_DISABLE (1 << 2)
#define FIPS_MODE_FLAG_VERBOSE (1 << 3)
#define FIPS_MODE_FLAG_US_FORCEFAIL (1 << 4)
#define FIPS_MODE_FLAG_KS_FORCEFAIL (1 << 5)
#define FIPS_MODE_FLAG_NOINTEG (1 << 6)
#define FIPS_MODE_FLAG_TRACE (1 << 7)
#define FIPS_MODE_FLAG_NOPANIC (1 << 8)
#define FIPS_MODE_IS_DEBUG(MODE) ((MODE) & FIPS_MODE_FLAG_DEBUG)
#define FIPS_MODE_IS_FULL(MODE) ((MODE) & FIPS_MODE_FLAG_FULL)
#define FIPS_MODE_IS_DISABLE(MODE) ((MODE) & FIPS_MODE_FLAG_DISABLE)
#define FIPS_MODE_IS_VERBOSE(MODE) ((MODE) & FIPS_MODE_FLAG_VERBOSE)
#define FIPS_MODE_IS_US_FORCEFAIL(MODE) ((MODE) & FIPS_MODE_FLAG_US_FORCEFAIL)
#define FIPS_MODE_IS_KS_FORCEFAIL(MODE) ((MODE) & FIPS_MODE_FLAG_KS_FORCEFAIL)
#define FIPS_MODE_IS_NOINTEG(MODE) ((MODE) & FIPS_MODE_FLAG_NOINTEG)
#define FIPS_MODE_IS_TRACE(MODE) ((MODE) & FIPS_MODE_FLAG_TRACE)
#define FIPS_MODE_IS_NOPANIC(MODE) ((MODE) & FIPS_MODE_FLAG_NOPANIC)
#if CC_KERNEL
#define FIPS_MODE_FLAG_FORCEFAIL FIPS_MODE_FLAG_KS_FORCEFAIL
#define FIPS_MODE_IS_FORCEFAIL(MODE) FIPS_MODE_IS_KS_FORCEFAIL(MODE)
#else
#define FIPS_MODE_FLAG_FORCEFAIL FIPS_MODE_FLAG_US_FORCEFAIL
#define FIPS_MODE_IS_FORCEFAIL(MODE) FIPS_MODE_IS_US_FORCEFAIL(MODE)
#endif
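/*
 * Illustrative only: a diagnostic configuration that forces a logged,
 * non-panicking POST failure with extra output could combine the flags roughly
 * like this. The variable name is hypothetical.
 */
#if 0
uint32_t example_fips_mode = FIPS_MODE_FLAG_FULL | FIPS_MODE_FLAG_FORCEFAIL |
                             FIPS_MODE_FLAG_NOPANIC | FIPS_MODE_FLAG_VERBOSE;
/* ... later: fipspost_post(example_fips_mode, pmach_header); */
#endif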
struct mach_header;
/*
* Entrypoint for all POST tests.
*/
int fipspost_post(uint32_t fips_mode, struct mach_header *pmach_header);
#endif /* _CORECRYPTO_FIPSPOST_H_ */

View File

@ -0,0 +1,18 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_GET_CPU_KEY_H_
#define _CORECRYPTO_FIPSPOST_GET_CPU_KEY_H_
size_t fipspost_get_cpu_key(char *label, size_t label_size, cpu_type_t cpuType,
cpu_subtype_t cpusubtype);
#endif

View File

@ -0,0 +1,101 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_GET_HMAC_H_
#define _CORECRYPTO_FIPSPOST_GET_HMAC_H_
#include <corecrypto/ccsha2.h>
struct mach_header;
/*
* The pre-calculated SHA256 HMAC gets placed here for integrity
* testing. The current value is a random number. Use a different random
* number for each architecture type supported.
*/
#define FIPSPOST_PRECALC_HMAC_SIZE CCSHA256_OUTPUT_SIZE
#define FIPSPOST_HMAC_VALUE fipspost_precalc_hmac
#define FIPSPOST_PRECALC_HMAC_VARIABLE \
__attribute__((section("__TEXT,__fips_hmacs"))) const unsigned char FIPSPOST_HMAC_VALUE[FIPSPOST_PRECALC_HMAC_SIZE]
#define FIPSPOST_PRECALC_HMAC(ARCH, MODE) \
{ ARCH, MODE, 0x10, 0xdc, 0xe5, 0x34, 0x6f, 0x01, \
0xdd, 0x82, 0xf8, 0xad, 0xe5, 0x8f, 0xa1, 0xcc, \
0xc1, 0x32, 0xe5, 0xa8, 0x53, 0xc8, 0x39, 0xa3, \
0x84, 0x5f, 0x3b, 0xcb, 0x39, 0x9e, 0xd1, 0x7b }
/* Comprehensive list, in the order of mach/machine.h */
#define FIPSPOST_PRECALC_HMAC_VALUE_X86_64 FIPSPOST_PRECALC_HMAC(0x86, 0x64)
#define FIPSPOST_PRECALC_HMAC_VALUE_X86_32 FIPSPOST_PRECALC_HMAC(0x86, 0x32)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_4T FIPSPOST_PRECALC_HMAC(0xa4, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_6 FIPSPOST_PRECALC_HMAC(0xa6, 0x00)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_V5TEJ FIPSPOST_PRECALC_HMAC(0xa5, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_XSCALE FIPSPOST_PRECALC_HMAC(0xa5, 0x02)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7A FIPSPOST_PRECALC_HMAC(0xa7, 0x0a)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7F FIPSPOST_PRECALC_HMAC(0xa7, 0x0f)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7S FIPSPOST_PRECALC_HMAC(0xa7, 0x05)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7K FIPSPOST_PRECALC_HMAC(0xa7, 0x04)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_6M FIPSPOST_PRECALC_HMAC(0xa6, 0x01)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7M FIPSPOST_PRECALC_HMAC(0xa7, 0x06)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_7EM FIPSPOST_PRECALC_HMAC(0xa7, 0x07)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64 FIPSPOST_PRECALC_HMAC(0xa8, 0x64)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64_V8 FIPSPOST_PRECALC_HMAC(0xa8, 0x68)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64E FIPSPOST_PRECALC_HMAC(0xa8, 0x6e)
#define FIPSPOST_PRECALC_HMAC_VALUE_ARM_64_32 FIPSPOST_PRECALC_HMAC(0xa8, 0x32)
#define FIPSPOST_CREATE_PRECALC_HMAC(ARCH, VARIANT) \
FIPSPOST_PRECALC_HMAC_VARIABLE = FIPSPOST_PRECALC_HMAC_VALUE ## _ ## ARCH ## _ ## VARIANT;
/*
* Declare the individual variants based on the current architecture. Use the
* raw compiler flags because each archive must have a different value, even if
* they're all classed as '__arm__', to avoid duplicate values in a FAT file.
*/
#if defined(__x86_64__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(X86, 64)
#elif defined(__i386__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(X86, 32)
#elif defined(__ARM_ARCH_4T__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 4T)
#elif defined(__ARM_ARCH_6K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 6)
// Unknown compiler flags for V5TEJ
// Unknown compiler flags for XSCALE
#elif defined (__ARM_ARCH_7A__) && !defined (__ARM_ARCH_7K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7A)
#elif defined (__ARM_ARCH_7F__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7F)
#elif defined (__ARM_ARCH_7S__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7S)
#elif defined (__ARM_ARCH_7K__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7K)
#elif defined(__ARM_ARCH_6M__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 6M)
#elif defined (__ARM_ARCH_7M__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7M)
#elif defined(__ARM_ARCH_7EM__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 7EM)
#elif defined(__arm64e__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64E)
#elif defined(__ARM64_ARCH_8_32__)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64_32)
#elif defined(__ARM_ARCH_ISA_A64)
#define FIPSPOST_DECLARE_PRECALC_HMAC FIPSPOST_CREATE_PRECALC_HMAC(ARM, 64)
// Unknown compiler flags for 64_V8
#else
#error Unsupported architecture type; add as necessary in the order of mach/machine.h.
#endif
#define FIPSPOST_EXTERN_PRECALC_HMAC extern FIPSPOST_PRECALC_HMAC_VARIABLE;
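/*
 * Intended usage, as suggested by the macros above (illustrative only): exactly
 * one translation unit in the module expands FIPSPOST_DECLARE_PRECALC_HMAC at
 * file scope, emitting the per-architecture placeholder constant into the
 * __TEXT,__fips_hmacs section, while other files reference it through
 * FIPSPOST_EXTERN_PRECALC_HMAC.
 */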
int fipspost_get_hmac(const struct mach_header* pmach_header, unsigned char* sha256HMACBuffer, size_t max_offset);
#endif

View File

@ -0,0 +1,33 @@
/* Copyright (c) (2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_INDICATOR_H_
#define _CORECRYPTO_FIPSPOST_INDICATOR_H_
/// Checks if a symmetric algorithm mode is allowed for the given key size.
int fips_allowed_mode(const void *mode, size_t key_byte_length);
/// Checks whether a function is allowed according to FIPS. The variadic arguments make precise the context in which the
/// function will be used, when required. E.g., for a SHA* hash function no parameters are needed, since the function alone
/// is sufficient to define the use. Conversely, a symmetric mode requires the key length in bytes and the cryptographic
/// algorithm. num_args: the number of passed arguments; it can currently be 0, 1, or 2. Depending on num_args, the following arguments can be:
/// * num_args == 1:
/// - struct ccdigest_info * for a DRBG function
/// - ccec_const_cp_t for an ECC function
/// - struct ccdigest_info * for a HMAC function
/// - ccdh_const_gp_t for a DH function
/// - ccec_const_cp_t for ECDH function
/// - key_byte_length for a KDF CTR CMAC function
/// - struct ccdigest_info * for a KDF CTR HMAC or PBKDF2 function
/// - key_bit_length for RSA related functions
int fips_allowed(const void *function, size_t num_args, ...);
#endif /* _CORECRYPTO_FIPSPOST_INDICATOR_H_ */

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_CBC_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_CBC_H_
int fipspost_post_aes_cbc(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,20 @@
/* Copyright (c) (2018,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
// Created on 5/1/18.
//
// Copyright (c) 2018 Apple Inc. All rights reserved.
#ifndef fipspost_post_aes_ccm_h
#define fipspost_post_aes_ccm_h
int fipspost_post_aes_ccm(uint32_t fips_mode);
#endif /* fipspost_post_aes_ccm_h */

View File

@ -0,0 +1,18 @@
/* Copyright (c) (2017,2019,2020) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_CMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_CMAC_H_
int fipspost_post_aes_cmac(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_ECB_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_ECB_H_
int fipspost_post_aes_ecb(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_GCM_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_GCM_H_
int fipspost_post_aes_gcm(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,20 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_SKG_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_SKG_H_
int fipspost_post_aes_skg_enc_ecb_128(uint32_t fips_mode);
int fipspost_post_aes_skg_dec_ecb_128(uint32_t fips_mode);
int fipspost_post_aes_skg_enc_cbc_128(uint32_t fips_mode);
int fipspost_post_aes_skg_dec_cbc_128(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_TRNG_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_TRNG_H_
int fipspost_post_aes_trng(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_AES_XTS_H_
#define _CORECRYPTO_FIPSPOST_POST_AES_XTS_H_
int fipspost_post_aes_xts(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_CTR_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_CTR_H_
int fipspost_post_drbg_ctr(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_HMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_HMAC_H_
int fipspost_post_drbg_hmac(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_DRBG_TRNG_H_
#define _CORECRYPTO_FIPSPOST_POST_DRBG_TRNG_H_
int fipspost_post_drbg_trng(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_ECDH_H_
#define _CORECRYPTO_FIPSPOST_POST_ECDH_H_
int fipspost_post_ecdh(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
* corecrypto is licensed under Apple Inc.s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_ECDSA_H_
#define _CORECRYPTO_FIPSPOST_POST_ECDSA_H_
#include <stdint.h>
// POST for ECDSA signature operations.
int fipspost_post_ecdsa(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_FFDH_H_
#define _CORECRYPTO_FIPSPOST_POST_FFDH_H_
#include <stdint.h>
// POST for finite-field Diffie-Hellman (FFDH) key agreement.
int fipspost_post_ffdh(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_HMAC_H_
#define _CORECRYPTO_FIPSPOST_POST_HMAC_H_
#include <stdint.h>
// POST for HMAC.
int fipspost_post_hmac(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,19 @@
/* Copyright (c) (2019,2020) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_
#define _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_
#include <stdint.h>
int fipspost_post_indicator(uint32_t fips_mode);
#endif /* _CORECRYPTO_FIPSPOST_POST_INDICATOR_H_ */

View File

@ -0,0 +1,19 @@
/* Copyright (c) (2017,2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_INTEGRITY_H_
#define _CORECRYPTO_FIPSPOST_POST_INTEGRITY_H_
#include <stdint.h>
struct mach_header;
// Software integrity POST over the Mach-O image identified by pmach_header.
int fipspost_post_integrity(uint32_t fips_mode, struct mach_header *pmach_header);
#endif
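Unlike the other POST routines, fipspost_post_integrity also takes a pointer to a Mach-O header, i.e. the image whose integrity is to be verified. A minimal sketch of obtaining such a header via dyld's _dyld_get_image_header() follows; using image index 0 (the main executable) is purely an illustrative assumption, and in practice the caller would presumably pass the header of the image that contains corecrypto.

#include <stdint.h>
#include <stdio.h>
#include <mach-o/dyld.h>

int fipspost_post_integrity(uint32_t fips_mode, struct mach_header *pmach_header);

int main(void)
{
    /* Header of image 0 (the main executable); chosen only for illustration. */
    const struct mach_header *mh = _dyld_get_image_header(0);
    int rc = fipspost_post_integrity(0 /* assumed default fips_mode */,
                                     (struct mach_header *)mh);
    printf("integrity POST rc=%d\n", rc);
    return rc == 0 ? 0 : 1;
}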

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2012,2015,2020) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_KDF_CTR_H_
#define _CORECRYPTO_FIPSPOST_POST_KDF_CTR_H_
#include <stdint.h>
// POST for the counter-mode KDF (KDF_CTR).
int fipspost_post_kdf_ctr(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,17 @@
/* Copyright (c) (2012,2015,2020) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_PBKDF_H_
#define _CORECRYPTO_FIPSPOST_POST_PBKDF_H_
#include <stdint.h>
// POST for the password-based KDF (PBKDF2).
int fipspost_post_pbkdf(uint32_t fips_mode);
#endif

View File

@ -0,0 +1,22 @@
/* Copyright (c) (2019) Apple Inc. All rights reserved.
*
 * corecrypto is licensed under Apple Inc.'s Internal Use License Agreement (which
* is contained in the License.txt file distributed with corecrypto) and only to
* people who accept that license. IMPORTANT: Any license rights granted to you by
* Apple Inc. (if any) are limited to internal use within your organization only on
* devices and computers you own or control, for the sole purpose of verifying the
* security characteristics and correct functioning of the Apple Software. You may
* not, directly or indirectly, redistribute the Apple Software or any portions thereof.
*/
#ifndef _CORECRYPTO_FIPSPOST_POST_RSA_H_
#define _CORECRYPTO_FIPSPOST_POST_RSA_H_
#include <stdint.h>
#include <stdlib.h>
// DER-encoded RSA key used for the RSA operation tests; taken from the FIPS 186-2 RSA test vectors.
extern const uint8_t fipspost_post_rsa_test_key[];
extern const size_t fipspost_post_rsa_test_key_nbytes;
#endif
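This header exposes the DER-encoded RSA test key as a raw byte array plus its length. A trivial sanity-check sketch that uses no corecrypto APIs: a DER-encoded RSA key is an ASN.1 SEQUENCE, so its first byte must be the SEQUENCE tag 0x30, which can be checked before handing the blob to an importer.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

extern const uint8_t fipspost_post_rsa_test_key[];
extern const size_t fipspost_post_rsa_test_key_nbytes;

int main(void)
{
    if (fipspost_post_rsa_test_key_nbytes == 0 ||
        fipspost_post_rsa_test_key[0] != 0x30 /* ASN.1 SEQUENCE tag */) {
        fprintf(stderr, "unexpected DER encoding\n");
        return EXIT_FAILURE;
    }
    printf("DER test key: %zu bytes\n", fipspost_post_rsa_test_key_nbytes);
    return EXIT_SUCCESS;
}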

Some files were not shown because too many files have changed in this diff.